From 45bf979ae54152488622df87fc829ed49a645621 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Fri, 5 Jul 2019 16:35:17 -0700
Subject: [PATCH 01/64] saving pipenv based Makefile to Makefile.pipenv,
 replacing with conda env based Makefile, adding pytorch to dependencies in
 setup.

---
 Makefile        | 12 ++++++------
 Makefile.pipenv | 27 +++++++++++++++++++++++++++
 setup.py        |  3 ++-
 3 files changed, 35 insertions(+), 7 deletions(-)
 create mode 100644 Makefile.pipenv

diff --git a/Makefile b/Makefile
index a159af5..6ac10e1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,9 @@
 install:
-	pipenv install --dev -e .
+	pip install -e .
 
 download_models:
-	pipenv run python -m spacy download en
-	pipenv run python -m nltk.downloader averaged_perceptron_tagger
+	python -m spacy download en
+	python -m nltk.downloader averaged_perceptron_tagger
 
 clean:
 	-rm -rf build
@@ -17,11 +17,11 @@ clean_test:
 dist:
 	make clean
 	make download_models
-	pipenv run python setup.py bdist_wheel --dist-dir target
+	python setup.py bdist_wheel --dist-dir target
 
 test:
 	make clean_test
-	pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds
+	nosetests --with-coverage --cover-html -s --verbosity=2 --cover-package=nerds
 
 lint:
-	pipenv run flake8 nerds --verbose
+	flake8 --ignore=W605,W504 --verbose nerds

diff --git a/Makefile.pipenv b/Makefile.pipenv
new file mode 100644
index 0000000..a159af5
--- /dev/null
+++ b/Makefile.pipenv
@@ -0,0 +1,27 @@
+install:
+	pipenv install --dev -e .
+
+download_models:
+	pipenv run python -m spacy download en
+	pipenv run python -m nltk.downloader averaged_perceptron_tagger
+
+clean:
+	-rm -rf build
+	-rm -rf target
+	-find . -name "__pycache__" -type d -depth -exec rm -rf {} \;
+
+clean_test:
+	-rm -rf cover
+	-rm .coverage
+
+dist:
+	make clean
+	make download_models
+	pipenv run python setup.py bdist_wheel --dist-dir target
+
+test:
+	make clean_test
+	pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds
+
+lint:
+	pipenv run flake8 nerds --verbose

diff --git a/setup.py b/setup.py
index edbc4dc..045b3bf 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,8 @@
         'sklearn',
         'sklearn-crfsuite',
         'spacy==2.0.11',
-        'tensorflow'
+        'tensorflow',
+        'torch'
     ],
     tests_require=[
         'coverage',

From 0866dd3b3c003d8c8093082bf2ad5d4462b9752d Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Wed, 23 Oct 2019 11:41:35 -0700
Subject: [PATCH 02/64] moving examples out of main nerds package

---
 {nerds/examples => examples}/GMB/README.md      | 0
 {nerds/examples => examples}/GMB/read_data.py   | 0
 {nerds/examples => examples}/GMB/test_models.py | 0
 {nerds/examples => examples}/GMB/train.csv      | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename {nerds/examples => examples}/GMB/README.md (100%)
 rename {nerds/examples => examples}/GMB/read_data.py (100%)
 rename {nerds/examples => examples}/GMB/test_models.py (100%)
 rename {nerds/examples => examples}/GMB/train.csv (100%)

diff --git a/nerds/examples/GMB/README.md b/examples/GMB/README.md
similarity index 100%
rename from nerds/examples/GMB/README.md
rename to examples/GMB/README.md
diff --git a/nerds/examples/GMB/read_data.py b/examples/GMB/read_data.py
similarity index 100%
rename from nerds/examples/GMB/read_data.py
rename to examples/GMB/read_data.py
diff --git a/nerds/examples/GMB/test_models.py b/examples/GMB/test_models.py
similarity index 100%
rename from nerds/examples/GMB/test_models.py
rename to examples/GMB/test_models.py
diff --git a/nerds/examples/GMB/train.csv b/examples/GMB/train.csv
similarity index 100%
rename from nerds/examples/GMB/train.csv
rename to examples/GMB/train.csv

From d9103c9eccbf9b33d211f1a926c133edd437c401 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Wed, 23 Oct 2019 12:05:25 -0700
Subject: [PATCH 03/64] updated conda env instructions

---
 README.md | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7b0dd81..314954e 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,29 @@ pip3 install pipenv
 
 This will make sure that `pipenv` uses your latest version of Python3, which is hopefully 3.6 or higher. Please refer to the [official website](https://docs.pipenv.org/) for more information on `pipenv`.
 
-A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, test and build the tool easily.
+A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, test and build the tool easily. Note that this is the preferred environment setup approach: the `Pipfile` and `Pipfile.lock` files ensure that you automatically have access to the packages listed in `requirements.txt` after you run `make install` (see below).
 
-### Makefile specifications
+## Setting up the environment using `conda`
+
+Alternatively, if you are using the [Anaconda distribution of Python](https://www.anaconda.com/), you can also use `conda` to create an environment using the following command:
+
+```
+conda create -n nerds python=3.6 anaconda
+```
+
+You can then enter the newly created conda environment using the following command. After you run the various `make ...` commands, the packages listed in `requirements.txt` and the downloaded models will only be visible inside the `nerds` environment. This isolation helps prevent version collisions between different environments, at the cost of more disk space.
+
+```
+conda activate nerds
+```
+
+You can exit the environment using the following command.
+
+```
+conda deactivate
+```
+
+## Makefile specifications
 
 To install all of the required packages for development and testing run:

From 1d05a78464e349b0964f1f278c8d7a2c4de50893 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Mon, 28 Oct 2019 16:43:09 -0700
Subject: [PATCH 04/64] NERDS-4: Multi-class dictionary NER

---
 nerds/core/model/ner/dictionary.py           | 72 ++++++++++++++++++++
 nerds/test/data/dictionary/biodictionary.txt |  2 +
 nerds/test/test_dictionary_ner_model.py      | 23 ++++++-
 nerds/util/convert.py                        | 60 ++++++++++++++++
 4 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 nerds/test/data/dictionary/biodictionary.txt

diff --git a/nerds/core/model/ner/dictionary.py b/nerds/core/model/ner/dictionary.py
index b8ca078..0ba77a5 100644
--- a/nerds/core/model/ner/dictionary.py
+++ b/nerds/core/model/ner/dictionary.py
@@ -76,3 +76,75 @@ def transform(self, X, y=None):
                 encoding=document.encoding))
 
         return annotated_documents
+
+
+class ExactMatchMultiClassDictionaryNER(NERModel):
+
+    def __init__(self, path_to_dictionary_file):
+        super().__init__()
+        self.key = "em-dict"
+
+        if path_to_dictionary_file is not None:
+            self.path_to_dictionary_file = path_to_dictionary_file
+            self._create_automaton()
+        else:
+            # Must get a dictionary as an input!
+            log.warning("No path to dictionary provided, call fit() to load one")
+
+    def _create_automaton(self):
+
+        if not isfile(self.path_to_dictionary_file):
+            raise Exception("%s is not a file." % self.path_to_dictionary_file)
+
+        # Initialize automaton.
+        self.automaton = ahocorasick.Automaton()
+
+        # Dictionary must be one (phrase, entity_type) TSV pair per line.
+        log.debug("Started loading dictionary at {}".format(
+            self.path_to_dictionary_file))
+        with open(self.path_to_dictionary_file, 'r') as dict_file:
+            for line in dict_file:
+                search_expr, entity_type = line.strip().split('\t')
+                if search_expr != "":
+                    self.automaton.add_word(search_expr, (entity_type, search_expr))
+        log.debug("Successfully loaded dictionary")
+
+        self.automaton.make_automaton()
+
+    def transform(self, X, y=None):
+        """ Annotates the list of `Document` objects that are provided as
+            input and returns a list of `AnnotatedDocument` objects.
+
+            In a dictionary based approach, a dictionary of keywords is used
+            to create a FSA which is then used to search with. See [1].
+            [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
+        """
+        annotated_documents = []
+        for document in X:
+            annotations = []
+            doc_content_str = document.plain_text_
+            for item in self.automaton.iter(doc_content_str):
+                end_position, (label, word) = item
+
+                start_position = (end_position - len(word) + 1)
+                end_position = end_position + 1
+
+                # Aho-Corasick matches partial strings in the input document, which
+                # leads to spurious matches, so we check to see that the match spans
+                # a full word before adding it to our list of valid annotations.
+                if ((start_position <= 0 and end_position >= len(doc_content_str)) or
+                        (start_position <= 0 and doc_content_str[end_position] == " ") or
+                        (end_position >= len(doc_content_str) and doc_content_str[start_position - 1] == " ") or
+                        (doc_content_str[start_position - 1] == " " and doc_content_str[end_position] == " ")):
+                    annotations.append(Annotation(
+                        word,
+                        label,
+                        (start_position, end_position)))
+
+            annotated_documents.append(AnnotatedDocument(
+                document.content,
+                annotations=annotations,
+                encoding=document.encoding))
+
+        return annotated_documents
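The full-word check above is easiest to see with a tiny pyahocorasick session. A minimal sketch (the phrases are made up, but add_word, make_automaton and iter are the same calls the class uses):

    import ahocorasick

    automaton = ahocorasick.Automaton()
    # payload mirrors the class above: (entity_type, phrase)
    automaton.add_word("HUMARA loci", ("DNA", "HUMARA loci"))
    automaton.add_word("loci", ("DNA", "loci"))  # hypothetical shorter entry
    automaton.make_automaton()

    # iter() yields (end_index, payload) for every match, including matches
    # that begin or end inside a word, hence the boundary checks in transform().
    for end_index, (label, word) in automaton.iter("the HUMARA loci in blood"):
        print(end_index, label, word)
    # reports both the "HUMARA loci" match and the subsumed "loci" match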
+ """) + ner = ExactMatchMultiClassDictionaryNER( + "nerds/test/data/dictionary/biodictionary.txt") + annotated = ner.transform([document]) + expected_labels = ["DNA", "cell-type"] + for i, annotation in enumerate(annotated[0].annotations): + pred_text = annotation.text + pred_offsets = annotation.offset + label_text = document.plain_text_[pred_offsets[0]:pred_offsets[1]] + assert_equal(pred_text, label_text, + "predicted {:s} != label {:s}".format(pred_text, label_text)) + assert_equal(annotation.label, expected_labels[i]) + + diff --git a/nerds/util/convert.py b/nerds/util/convert.py index d151dc1..051c210 100644 --- a/nerds/util/convert.py +++ b/nerds/util/convert.py @@ -100,6 +100,7 @@ def transform_annotated_document_to_bio_format( tokens += non_tagged_tokens labels += non_tagged_labels + return tokens, labels @@ -202,6 +203,65 @@ def transform_bio_tags_to_annotated_document(tokens, bio_tags, document): document.content, annotations=annotations, encoding=document.encoding) +def transform_annotated_documents_to_multiclass_dictionary( + annotated_documents, dict_filename, + stopwords=None, write_entity_type=True): + """ Convert a collection of AnnotatedDocument objects to (phrase, + entity_type) tuples and writes them out to dict_filename. + + Args: + annotated_documents -- collection of AnnotatedDocument objects + dict_filename -- path to dictionary file to create + stopwords -- specify set of phrases (usually english stopwords) + that should not be marked up as entities. Default = None + implies no stopword filtering + write_entity_type -- if True, writes out entities as TSV (phrase, + entity_type), else writes out just the phrase, one per line. + Former format suitable for ExactMatchMultiClassDictionaryNER, + latter format suitable for ExactMatchDictionaryNER. + + Returns: + None + """ + + fdict = open(dict_filename, "w") + for annotated_document in annotated_documents: + tokens, tags = transform_annotated_document_to_bio_format(annotated_document) + phrase_tokens, prev_tag, already_seen_phrases = [], None, set() + for token, tag in zip(tokens, tags): + # print("token:", token, "tag:", tag) + if tag == "O": + if len(phrase_tokens) > 0: + phrase = " ".join(phrase_tokens) + prev_tag = prev_tag[2:] # remove B_ and I_ prefix + # print("... phrase:", phrase, "tag:", prev_tag) + if phrase not in already_seen_phrases: + if stopwords is not None and phrase not in stopwords: + if write_entity_type: + fdict.write("{:s}\t{:s}\n".format(phrase, prev_tag)) + else: + fdict.write("{:s}\n".format(phrase)) + already_seen_phrases.add(phrase) + phrase_tokens, prev_tag = [], None + continue + else: + phrase_tokens.append(token) + prev_tag = tag + + if len(phrase_tokens) > 0: + phrase = " ".join(phrase_tokens) + prev_tag = prev_tag[2:] # remove B_ and I_ prefix + # print("... 
(last) phrase:", phrase, "tag:", prev_tag) + if phrase not in already_seen_phrases: + if stopwords is not None and phrase not in stopwords: + if write_entity_type: + fdict.write("{:s}\t{:s}\n".format(phrase, prev_tag)) + else: + fdict.write("{:s}\n".format(phrase)) + + fdict.close() + + def split_annotated_documents( annotated_documents, splitter=document_to_sentences): """ Wrapper function that applies `split_annotated_document` to a From 5ca6a4e778703ec1d0032abc80c716060a80a490 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 28 Oct 2019 17:48:09 -0700 Subject: [PATCH 05/64] NERDS-6: convenience method classification_report --- nerds/core/model/evaluate/score.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/nerds/core/model/evaluate/score.py b/nerds/core/model/evaluate/score.py index b20a099..cdd9d9a 100644 --- a/nerds/core/model/evaluate/score.py +++ b/nerds/core/model/evaluate/score.py @@ -63,3 +63,29 @@ def calculate_precision_recall_f1score(y_pred, y_true, entity_label=None): (precision + recall) > 0 else 0. return (precision, recall, f1_score) + + +def classification_report(y_pred, y_true, entity_labels): + """ Pretty prints a classification report based on precision, + recall, and f1-scores from `calculate_precision_recall_f1score` + for each entity label supplied and the aggregate. + + Args: + y_pred (list(AnnotatedDocument)): The predictions of an NER + model in the form of a list of annotated documents. + y_true (list(AnnotatedDocument)): The ground truth set of + annotated documents. + entity_labels (list(str)): The entity labels for which + the scores are calculated. + + Returns: + None + """ + + print(" precision recall f1-score") + for l in sorted(entity_labels): + p, r, f = calculate_precision_recall_f1score(y_pred, y_true, entity_label=l) + print("{:20s} {:.3f} {:.3f} {:.3f}".format(l, p, r, f)) + p, r, f = calculate_precision_recall_f1score(y_pred, y_true) + print("") + print("{:20s} {:.3f} {:.3f} {:.3f}".format("--all--", p, r, f)) From 98f8797762ab87b9844e7813a712835350d64b3d Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 29 Oct 2019 08:08:25 -0700 Subject: [PATCH 06/64] NERDS-4: added a fit() method to ExactMatchMultiClassDictionaryNER --- nerds/core/model/ner/dictionary.py | 15 ++++++++++++++ nerds/test/test_dictionary_ner_model.py | 26 ++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/nerds/core/model/ner/dictionary.py b/nerds/core/model/ner/dictionary.py index 0ba77a5..270f358 100644 --- a/nerds/core/model/ner/dictionary.py +++ b/nerds/core/model/ner/dictionary.py @@ -111,6 +111,21 @@ def _create_automaton(self): self.automaton.make_automaton() + def fit(self, X, y=None): + # Initialize automaton. + self.automaton = ahocorasick.Automaton() + + # populate automaton from annotation values provided + for annotated_document in X: + for annotation in annotated_document.annotations: + search_expr = annotation.text + entity_type = annotation.label + if search_expr != "": + self.automaton.add_word(search_expr, (entity_type, search_expr)) + log.debug("Successfully loaded dictionary") + + self.automaton.make_automaton() + def transform(self, X, y=None): """ Annotates the list of `Document` objects that are provided as input and returns a list of `AnnotatedDocument` objects. 
diff --git a/nerds/test/test_dictionary_ner_model.py b/nerds/test/test_dictionary_ner_model.py
index 7ce6a64..db9f67b 100644
--- a/nerds/test/test_dictionary_ner_model.py
+++ b/nerds/test/test_dictionary_ner_model.py
@@ -1,6 +1,7 @@
 from nose.tools import assert_equal, assert_in, assert_true
 
-from nerds.core.model.input.document import Document
+from nerds.core.model.input.annotation import Annotation
+from nerds.core.model.input.document import AnnotatedDocument, Document
 from nerds.core.model.ner.dictionary import ExactMatchDictionaryNER
 from nerds.core.model.ner.dictionary import ExactMatchMultiClassDictionaryNER
 
@@ -25,6 +26,7 @@ def test_ExactMatchDictionaryNER():
     assert_in("Springer", unique_annotations)
     assert_in("Wiley", unique_annotations)
 
+
 def test_ExactMatchMultiClassDictionaryNER():
     document = Document(b"""
@@ -44,3 +46,25 @@ def test_ExactMatchMultiClassDictionaryNER():
         assert_equal(annotation.label, expected_labels[i])
+
+
+def test_ExactMatchMultiClassDictionaryNER2():
+    documents = [
+        AnnotatedDocument(b"""
+            In this study , we have used the polymerase chain reaction ( PCR ) with nested
+            primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils
+            from female patients with eosinophilia .
+        """, annotations=[
+            Annotation("HUMARA loci", "DNA", (139, 150)),
+            Annotation("purified eosinophils", "cell-type", (154, 174))
+        ])]
+    ner = ExactMatchMultiClassDictionaryNER(
+        "nerds/test/data/dictionary/biodictionary.txt")
+    ner.fit(documents)
+    pred_documents = ner.transform(documents)
+    expected_labels = ["DNA", "cell-type"]
+    for i, annotation in enumerate(pred_documents[0].annotations):
+        pred_text = annotation.text
+        pred_offsets = annotation.offset
+        label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]]
+        assert_equal(pred_text, label_text,
+                     "predicted {:s} != label {:s}".format(pred_text, label_text))
+        assert_equal(annotation.label, expected_labels[i])

From ce26f2f8e56ea26acf5976e2cc7c572cb4e6543a Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 12 Nov 2019 13:07:49 -0800
Subject: [PATCH 07/64] moving the current nerds package to nerds_orig

---
 {nerds => nerds_orig}/__init__.py                       | 0
 {nerds => nerds_orig}/core/__init__.py                  | 0
 {nerds => nerds_orig}/core/model/__init__.py            | 0
 {nerds => nerds_orig}/core/model/config/__init__.py     | 0
 {nerds => nerds_orig}/core/model/config/base.py         | 0
 {nerds => nerds_orig}/core/model/config/bilstm.py       | 0
 {nerds => nerds_orig}/core/model/config/crf.py          | 0
 {nerds => nerds_orig}/core/model/config/ensemble.py     | 0
 {nerds => nerds_orig}/core/model/config/error.py        | 0
 {nerds => nerds_orig}/core/model/config/spacy.py        | 0
 {nerds => nerds_orig}/core/model/evaluate/__init__.py   | 0
 {nerds => nerds_orig}/core/model/evaluate/score.py      | 0
 {nerds => nerds_orig}/core/model/evaluate/validation.py | 0
 {nerds => nerds_orig}/core/model/input/__init__.py      | 0
 {nerds => nerds_orig}/core/model/input/annotation.py    | 0
 {nerds => nerds_orig}/core/model/input/base.py          | 0
 {nerds => nerds_orig}/core/model/input/brat.py          | 0
 {nerds => nerds_orig}/core/model/input/document.py      | 0
 {nerds => nerds_orig}/core/model/ner/__init__.py        | 0
 {nerds => nerds_orig}/core/model/ner/base.py            | 0
 {nerds => nerds_orig}/core/model/ner/bilstm.py          | 0
 {nerds => nerds_orig}/core/model/ner/crf.py             | 0
 {nerds => nerds_orig}/core/model/ner/dictionary.py      | 0
 {nerds => nerds_orig}/core/model/ner/ensemble.py        | 0
 {nerds => nerds_orig}/core/model/ner/spacy.py           | 0
 {nerds => nerds_orig}/core/model/optimize/__init__.py   | 0
 {nerds
=> nerds_orig}/core/model/optimize/optimizer.py | 0 {nerds => nerds_orig}/core/model/optimize/params.py | 0 {nerds => nerds_orig}/core/model/output/__init__.py | 0 {nerds => nerds_orig}/core/model/output/brat.py | 0 {nerds => nerds_orig}/test/__init__.py | 0 {nerds => nerds_orig}/test/data/brat/file1.ann | 0 {nerds => nerds_orig}/test/data/brat/file1.txt | 0 {nerds => nerds_orig}/test/data/config/sample.yaml | 0 {nerds => nerds_orig}/test/data/config/sample_error.yaml | 0 {nerds => nerds_orig}/test/data/dictionary/biodictionary.txt | 0 {nerds => nerds_orig}/test/data/dictionary/orgdictionary.txt | 0 {nerds => nerds_orig}/test/data/not_annotated/file1.txt | 0 {nerds => nerds_orig}/test/data/not_annotated/file2.txt | 0 {nerds => nerds_orig}/test/test_annotation.py | 0 {nerds => nerds_orig}/test/test_base_config.py | 0 {nerds => nerds_orig}/test/test_base_ner_class.py | 0 {nerds => nerds_orig}/test/test_bilstm_ner_model.py | 0 {nerds => nerds_orig}/test/test_brat.py | 0 {nerds => nerds_orig}/test/test_crf_ner_model.py | 0 {nerds => nerds_orig}/test/test_dictionary_ner_model.py | 0 {nerds => nerds_orig}/test/test_document.py | 0 {nerds => nerds_orig}/test/test_ensemble.py | 0 {nerds => nerds_orig}/test/test_ensemble_config.py | 0 {nerds => nerds_orig}/test/test_eval_scoring.py | 0 {nerds => nerds_orig}/test/test_kfold_cv.py | 0 {nerds => nerds_orig}/test/test_ner_model_optimizer.py | 0 {nerds => nerds_orig}/test/test_pipeline.py | 0 {nerds => nerds_orig}/test/test_spacy_ner_model.py | 0 {nerds => nerds_orig}/test/test_util_convert.py | 0 {nerds => nerds_orig}/test/test_util_file.py | 0 {nerds => nerds_orig}/test/test_util_nlp.py | 0 {nerds => nerds_orig}/test/test_util_string.py | 0 {nerds => nerds_orig}/util/__init__.py | 0 {nerds => nerds_orig}/util/convert.py | 0 {nerds => nerds_orig}/util/file.py | 0 {nerds => nerds_orig}/util/logging.py | 0 {nerds => nerds_orig}/util/nlp.py | 0 {nerds => nerds_orig}/util/string.py | 0 64 files changed, 0 insertions(+), 0 deletions(-) rename {nerds => nerds_orig}/__init__.py (100%) rename {nerds => nerds_orig}/core/__init__.py (100%) rename {nerds => nerds_orig}/core/model/__init__.py (100%) rename {nerds => nerds_orig}/core/model/config/__init__.py (100%) rename {nerds => nerds_orig}/core/model/config/base.py (100%) rename {nerds => nerds_orig}/core/model/config/bilstm.py (100%) rename {nerds => nerds_orig}/core/model/config/crf.py (100%) rename {nerds => nerds_orig}/core/model/config/ensemble.py (100%) rename {nerds => nerds_orig}/core/model/config/error.py (100%) rename {nerds => nerds_orig}/core/model/config/spacy.py (100%) rename {nerds => nerds_orig}/core/model/evaluate/__init__.py (100%) rename {nerds => nerds_orig}/core/model/evaluate/score.py (100%) rename {nerds => nerds_orig}/core/model/evaluate/validation.py (100%) rename {nerds => nerds_orig}/core/model/input/__init__.py (100%) rename {nerds => nerds_orig}/core/model/input/annotation.py (100%) rename {nerds => nerds_orig}/core/model/input/base.py (100%) rename {nerds => nerds_orig}/core/model/input/brat.py (100%) rename {nerds => nerds_orig}/core/model/input/document.py (100%) rename {nerds => nerds_orig}/core/model/ner/__init__.py (100%) rename {nerds => nerds_orig}/core/model/ner/base.py (100%) rename {nerds => nerds_orig}/core/model/ner/bilstm.py (100%) rename {nerds => nerds_orig}/core/model/ner/crf.py (100%) rename {nerds => nerds_orig}/core/model/ner/dictionary.py (100%) rename {nerds => nerds_orig}/core/model/ner/ensemble.py (100%) rename {nerds => nerds_orig}/core/model/ner/spacy.py 
(100%) rename {nerds => nerds_orig}/core/model/optimize/__init__.py (100%) rename {nerds => nerds_orig}/core/model/optimize/optimizer.py (100%) rename {nerds => nerds_orig}/core/model/optimize/params.py (100%) rename {nerds => nerds_orig}/core/model/output/__init__.py (100%) rename {nerds => nerds_orig}/core/model/output/brat.py (100%) rename {nerds => nerds_orig}/test/__init__.py (100%) rename {nerds => nerds_orig}/test/data/brat/file1.ann (100%) rename {nerds => nerds_orig}/test/data/brat/file1.txt (100%) rename {nerds => nerds_orig}/test/data/config/sample.yaml (100%) rename {nerds => nerds_orig}/test/data/config/sample_error.yaml (100%) rename {nerds => nerds_orig}/test/data/dictionary/biodictionary.txt (100%) rename {nerds => nerds_orig}/test/data/dictionary/orgdictionary.txt (100%) rename {nerds => nerds_orig}/test/data/not_annotated/file1.txt (100%) rename {nerds => nerds_orig}/test/data/not_annotated/file2.txt (100%) rename {nerds => nerds_orig}/test/test_annotation.py (100%) rename {nerds => nerds_orig}/test/test_base_config.py (100%) rename {nerds => nerds_orig}/test/test_base_ner_class.py (100%) rename {nerds => nerds_orig}/test/test_bilstm_ner_model.py (100%) rename {nerds => nerds_orig}/test/test_brat.py (100%) rename {nerds => nerds_orig}/test/test_crf_ner_model.py (100%) rename {nerds => nerds_orig}/test/test_dictionary_ner_model.py (100%) rename {nerds => nerds_orig}/test/test_document.py (100%) rename {nerds => nerds_orig}/test/test_ensemble.py (100%) rename {nerds => nerds_orig}/test/test_ensemble_config.py (100%) rename {nerds => nerds_orig}/test/test_eval_scoring.py (100%) rename {nerds => nerds_orig}/test/test_kfold_cv.py (100%) rename {nerds => nerds_orig}/test/test_ner_model_optimizer.py (100%) rename {nerds => nerds_orig}/test/test_pipeline.py (100%) rename {nerds => nerds_orig}/test/test_spacy_ner_model.py (100%) rename {nerds => nerds_orig}/test/test_util_convert.py (100%) rename {nerds => nerds_orig}/test/test_util_file.py (100%) rename {nerds => nerds_orig}/test/test_util_nlp.py (100%) rename {nerds => nerds_orig}/test/test_util_string.py (100%) rename {nerds => nerds_orig}/util/__init__.py (100%) rename {nerds => nerds_orig}/util/convert.py (100%) rename {nerds => nerds_orig}/util/file.py (100%) rename {nerds => nerds_orig}/util/logging.py (100%) rename {nerds => nerds_orig}/util/nlp.py (100%) rename {nerds => nerds_orig}/util/string.py (100%) diff --git a/nerds/__init__.py b/nerds_orig/__init__.py similarity index 100% rename from nerds/__init__.py rename to nerds_orig/__init__.py diff --git a/nerds/core/__init__.py b/nerds_orig/core/__init__.py similarity index 100% rename from nerds/core/__init__.py rename to nerds_orig/core/__init__.py diff --git a/nerds/core/model/__init__.py b/nerds_orig/core/model/__init__.py similarity index 100% rename from nerds/core/model/__init__.py rename to nerds_orig/core/model/__init__.py diff --git a/nerds/core/model/config/__init__.py b/nerds_orig/core/model/config/__init__.py similarity index 100% rename from nerds/core/model/config/__init__.py rename to nerds_orig/core/model/config/__init__.py diff --git a/nerds/core/model/config/base.py b/nerds_orig/core/model/config/base.py similarity index 100% rename from nerds/core/model/config/base.py rename to nerds_orig/core/model/config/base.py diff --git a/nerds/core/model/config/bilstm.py b/nerds_orig/core/model/config/bilstm.py similarity index 100% rename from nerds/core/model/config/bilstm.py rename to nerds_orig/core/model/config/bilstm.py diff --git 
a/nerds/core/model/config/crf.py b/nerds_orig/core/model/config/crf.py similarity index 100% rename from nerds/core/model/config/crf.py rename to nerds_orig/core/model/config/crf.py diff --git a/nerds/core/model/config/ensemble.py b/nerds_orig/core/model/config/ensemble.py similarity index 100% rename from nerds/core/model/config/ensemble.py rename to nerds_orig/core/model/config/ensemble.py diff --git a/nerds/core/model/config/error.py b/nerds_orig/core/model/config/error.py similarity index 100% rename from nerds/core/model/config/error.py rename to nerds_orig/core/model/config/error.py diff --git a/nerds/core/model/config/spacy.py b/nerds_orig/core/model/config/spacy.py similarity index 100% rename from nerds/core/model/config/spacy.py rename to nerds_orig/core/model/config/spacy.py diff --git a/nerds/core/model/evaluate/__init__.py b/nerds_orig/core/model/evaluate/__init__.py similarity index 100% rename from nerds/core/model/evaluate/__init__.py rename to nerds_orig/core/model/evaluate/__init__.py diff --git a/nerds/core/model/evaluate/score.py b/nerds_orig/core/model/evaluate/score.py similarity index 100% rename from nerds/core/model/evaluate/score.py rename to nerds_orig/core/model/evaluate/score.py diff --git a/nerds/core/model/evaluate/validation.py b/nerds_orig/core/model/evaluate/validation.py similarity index 100% rename from nerds/core/model/evaluate/validation.py rename to nerds_orig/core/model/evaluate/validation.py diff --git a/nerds/core/model/input/__init__.py b/nerds_orig/core/model/input/__init__.py similarity index 100% rename from nerds/core/model/input/__init__.py rename to nerds_orig/core/model/input/__init__.py diff --git a/nerds/core/model/input/annotation.py b/nerds_orig/core/model/input/annotation.py similarity index 100% rename from nerds/core/model/input/annotation.py rename to nerds_orig/core/model/input/annotation.py diff --git a/nerds/core/model/input/base.py b/nerds_orig/core/model/input/base.py similarity index 100% rename from nerds/core/model/input/base.py rename to nerds_orig/core/model/input/base.py diff --git a/nerds/core/model/input/brat.py b/nerds_orig/core/model/input/brat.py similarity index 100% rename from nerds/core/model/input/brat.py rename to nerds_orig/core/model/input/brat.py diff --git a/nerds/core/model/input/document.py b/nerds_orig/core/model/input/document.py similarity index 100% rename from nerds/core/model/input/document.py rename to nerds_orig/core/model/input/document.py diff --git a/nerds/core/model/ner/__init__.py b/nerds_orig/core/model/ner/__init__.py similarity index 100% rename from nerds/core/model/ner/__init__.py rename to nerds_orig/core/model/ner/__init__.py diff --git a/nerds/core/model/ner/base.py b/nerds_orig/core/model/ner/base.py similarity index 100% rename from nerds/core/model/ner/base.py rename to nerds_orig/core/model/ner/base.py diff --git a/nerds/core/model/ner/bilstm.py b/nerds_orig/core/model/ner/bilstm.py similarity index 100% rename from nerds/core/model/ner/bilstm.py rename to nerds_orig/core/model/ner/bilstm.py diff --git a/nerds/core/model/ner/crf.py b/nerds_orig/core/model/ner/crf.py similarity index 100% rename from nerds/core/model/ner/crf.py rename to nerds_orig/core/model/ner/crf.py diff --git a/nerds/core/model/ner/dictionary.py b/nerds_orig/core/model/ner/dictionary.py similarity index 100% rename from nerds/core/model/ner/dictionary.py rename to nerds_orig/core/model/ner/dictionary.py diff --git a/nerds/core/model/ner/ensemble.py b/nerds_orig/core/model/ner/ensemble.py similarity index 100% 
rename from nerds/core/model/ner/ensemble.py rename to nerds_orig/core/model/ner/ensemble.py diff --git a/nerds/core/model/ner/spacy.py b/nerds_orig/core/model/ner/spacy.py similarity index 100% rename from nerds/core/model/ner/spacy.py rename to nerds_orig/core/model/ner/spacy.py diff --git a/nerds/core/model/optimize/__init__.py b/nerds_orig/core/model/optimize/__init__.py similarity index 100% rename from nerds/core/model/optimize/__init__.py rename to nerds_orig/core/model/optimize/__init__.py diff --git a/nerds/core/model/optimize/optimizer.py b/nerds_orig/core/model/optimize/optimizer.py similarity index 100% rename from nerds/core/model/optimize/optimizer.py rename to nerds_orig/core/model/optimize/optimizer.py diff --git a/nerds/core/model/optimize/params.py b/nerds_orig/core/model/optimize/params.py similarity index 100% rename from nerds/core/model/optimize/params.py rename to nerds_orig/core/model/optimize/params.py diff --git a/nerds/core/model/output/__init__.py b/nerds_orig/core/model/output/__init__.py similarity index 100% rename from nerds/core/model/output/__init__.py rename to nerds_orig/core/model/output/__init__.py diff --git a/nerds/core/model/output/brat.py b/nerds_orig/core/model/output/brat.py similarity index 100% rename from nerds/core/model/output/brat.py rename to nerds_orig/core/model/output/brat.py diff --git a/nerds/test/__init__.py b/nerds_orig/test/__init__.py similarity index 100% rename from nerds/test/__init__.py rename to nerds_orig/test/__init__.py diff --git a/nerds/test/data/brat/file1.ann b/nerds_orig/test/data/brat/file1.ann similarity index 100% rename from nerds/test/data/brat/file1.ann rename to nerds_orig/test/data/brat/file1.ann diff --git a/nerds/test/data/brat/file1.txt b/nerds_orig/test/data/brat/file1.txt similarity index 100% rename from nerds/test/data/brat/file1.txt rename to nerds_orig/test/data/brat/file1.txt diff --git a/nerds/test/data/config/sample.yaml b/nerds_orig/test/data/config/sample.yaml similarity index 100% rename from nerds/test/data/config/sample.yaml rename to nerds_orig/test/data/config/sample.yaml diff --git a/nerds/test/data/config/sample_error.yaml b/nerds_orig/test/data/config/sample_error.yaml similarity index 100% rename from nerds/test/data/config/sample_error.yaml rename to nerds_orig/test/data/config/sample_error.yaml diff --git a/nerds/test/data/dictionary/biodictionary.txt b/nerds_orig/test/data/dictionary/biodictionary.txt similarity index 100% rename from nerds/test/data/dictionary/biodictionary.txt rename to nerds_orig/test/data/dictionary/biodictionary.txt diff --git a/nerds/test/data/dictionary/orgdictionary.txt b/nerds_orig/test/data/dictionary/orgdictionary.txt similarity index 100% rename from nerds/test/data/dictionary/orgdictionary.txt rename to nerds_orig/test/data/dictionary/orgdictionary.txt diff --git a/nerds/test/data/not_annotated/file1.txt b/nerds_orig/test/data/not_annotated/file1.txt similarity index 100% rename from nerds/test/data/not_annotated/file1.txt rename to nerds_orig/test/data/not_annotated/file1.txt diff --git a/nerds/test/data/not_annotated/file2.txt b/nerds_orig/test/data/not_annotated/file2.txt similarity index 100% rename from nerds/test/data/not_annotated/file2.txt rename to nerds_orig/test/data/not_annotated/file2.txt diff --git a/nerds/test/test_annotation.py b/nerds_orig/test/test_annotation.py similarity index 100% rename from nerds/test/test_annotation.py rename to nerds_orig/test/test_annotation.py diff --git a/nerds/test/test_base_config.py 
b/nerds_orig/test/test_base_config.py similarity index 100% rename from nerds/test/test_base_config.py rename to nerds_orig/test/test_base_config.py diff --git a/nerds/test/test_base_ner_class.py b/nerds_orig/test/test_base_ner_class.py similarity index 100% rename from nerds/test/test_base_ner_class.py rename to nerds_orig/test/test_base_ner_class.py diff --git a/nerds/test/test_bilstm_ner_model.py b/nerds_orig/test/test_bilstm_ner_model.py similarity index 100% rename from nerds/test/test_bilstm_ner_model.py rename to nerds_orig/test/test_bilstm_ner_model.py diff --git a/nerds/test/test_brat.py b/nerds_orig/test/test_brat.py similarity index 100% rename from nerds/test/test_brat.py rename to nerds_orig/test/test_brat.py diff --git a/nerds/test/test_crf_ner_model.py b/nerds_orig/test/test_crf_ner_model.py similarity index 100% rename from nerds/test/test_crf_ner_model.py rename to nerds_orig/test/test_crf_ner_model.py diff --git a/nerds/test/test_dictionary_ner_model.py b/nerds_orig/test/test_dictionary_ner_model.py similarity index 100% rename from nerds/test/test_dictionary_ner_model.py rename to nerds_orig/test/test_dictionary_ner_model.py diff --git a/nerds/test/test_document.py b/nerds_orig/test/test_document.py similarity index 100% rename from nerds/test/test_document.py rename to nerds_orig/test/test_document.py diff --git a/nerds/test/test_ensemble.py b/nerds_orig/test/test_ensemble.py similarity index 100% rename from nerds/test/test_ensemble.py rename to nerds_orig/test/test_ensemble.py diff --git a/nerds/test/test_ensemble_config.py b/nerds_orig/test/test_ensemble_config.py similarity index 100% rename from nerds/test/test_ensemble_config.py rename to nerds_orig/test/test_ensemble_config.py diff --git a/nerds/test/test_eval_scoring.py b/nerds_orig/test/test_eval_scoring.py similarity index 100% rename from nerds/test/test_eval_scoring.py rename to nerds_orig/test/test_eval_scoring.py diff --git a/nerds/test/test_kfold_cv.py b/nerds_orig/test/test_kfold_cv.py similarity index 100% rename from nerds/test/test_kfold_cv.py rename to nerds_orig/test/test_kfold_cv.py diff --git a/nerds/test/test_ner_model_optimizer.py b/nerds_orig/test/test_ner_model_optimizer.py similarity index 100% rename from nerds/test/test_ner_model_optimizer.py rename to nerds_orig/test/test_ner_model_optimizer.py diff --git a/nerds/test/test_pipeline.py b/nerds_orig/test/test_pipeline.py similarity index 100% rename from nerds/test/test_pipeline.py rename to nerds_orig/test/test_pipeline.py diff --git a/nerds/test/test_spacy_ner_model.py b/nerds_orig/test/test_spacy_ner_model.py similarity index 100% rename from nerds/test/test_spacy_ner_model.py rename to nerds_orig/test/test_spacy_ner_model.py diff --git a/nerds/test/test_util_convert.py b/nerds_orig/test/test_util_convert.py similarity index 100% rename from nerds/test/test_util_convert.py rename to nerds_orig/test/test_util_convert.py diff --git a/nerds/test/test_util_file.py b/nerds_orig/test/test_util_file.py similarity index 100% rename from nerds/test/test_util_file.py rename to nerds_orig/test/test_util_file.py diff --git a/nerds/test/test_util_nlp.py b/nerds_orig/test/test_util_nlp.py similarity index 100% rename from nerds/test/test_util_nlp.py rename to nerds_orig/test/test_util_nlp.py diff --git a/nerds/test/test_util_string.py b/nerds_orig/test/test_util_string.py similarity index 100% rename from nerds/test/test_util_string.py rename to nerds_orig/test/test_util_string.py diff --git a/nerds/util/__init__.py b/nerds_orig/util/__init__.py 
similarity index 100% rename from nerds/util/__init__.py rename to nerds_orig/util/__init__.py diff --git a/nerds/util/convert.py b/nerds_orig/util/convert.py similarity index 100% rename from nerds/util/convert.py rename to nerds_orig/util/convert.py diff --git a/nerds/util/file.py b/nerds_orig/util/file.py similarity index 100% rename from nerds/util/file.py rename to nerds_orig/util/file.py diff --git a/nerds/util/logging.py b/nerds_orig/util/logging.py similarity index 100% rename from nerds/util/logging.py rename to nerds_orig/util/logging.py diff --git a/nerds/util/nlp.py b/nerds_orig/util/nlp.py similarity index 100% rename from nerds/util/nlp.py rename to nerds_orig/util/nlp.py diff --git a/nerds/util/string.py b/nerds_orig/util/string.py similarity index 100% rename from nerds/util/string.py rename to nerds_orig/util/string.py From aa7b2571693aad9cddb05082a971f7d234785505 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 12 Nov 2019 13:09:43 -0800 Subject: [PATCH 08/64] adding ExactMatchMultiClassDictionaryNER entry to __init__.py --- nerds_orig/core/model/ner/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nerds_orig/core/model/ner/__init__.py b/nerds_orig/core/model/ner/__init__.py index a226244..84f794f 100644 --- a/nerds_orig/core/model/ner/__init__.py +++ b/nerds_orig/core/model/ner/__init__.py @@ -2,6 +2,7 @@ from nerds.core.model.ner.bilstm import BidirectionalLSTM from nerds.core.model.ner.crf import CRF from nerds.core.model.ner.dictionary import ExactMatchDictionaryNER +from nerds.core.model.ner.dictionary import ExactMatchMultiClassDictionaryNER from nerds.core.model.ner.ensemble import NERModelEnsemble from nerds.core.model.ner.spacy import SpaCyStatisticalNER @@ -9,6 +10,7 @@ "BidirectionalLSTM", "CRF", "ExactMatchDictionaryNER", + "ExactMatchMultiClassDictionaryNER", "NERModel", "NERModelEnsemble", "SpaCyStatisticalNER" From 5f43d9d05daff5148e7cb04ced428e44de00d0e4 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 12 Nov 2019 13:10:51 -0800 Subject: [PATCH 09/64] streamlined nerds models and code with anago style data backbone --- nerds/__init__.py | 0 nerds/models/__init__.py | 15 ++ nerds/models/base.py | 42 ++++++ nerds/models/bilstm.py | 161 ++++++++++++++++++++ nerds/models/crf.py | 226 ++++++++++++++++++++++++++++ nerds/models/dictionary.py | 233 +++++++++++++++++++++++++++++ nerds/models/elmo.py | 174 ++++++++++++++++++++++ nerds/models/spacy.py | 238 ++++++++++++++++++++++++++++++ nerds/test/data/example.ents | 6 + nerds/test/data/example.iob | 38 +++++ nerds/test/test_base_ner.py | 27 ++++ nerds/test/test_bilstm_ner.py | 13 ++ nerds/test/test_crf_ner.py | 12 ++ nerds/test/test_dictionary_ner.py | 30 ++++ nerds/test/test_elmo_ner.py | 17 +++ nerds/test/test_spacy_ner.py | 12 ++ nerds/test/test_utils.py | 17 +++ nerds/utils.py | 48 ++++++ 18 files changed, 1309 insertions(+) create mode 100644 nerds/__init__.py create mode 100644 nerds/models/__init__.py create mode 100644 nerds/models/base.py create mode 100644 nerds/models/bilstm.py create mode 100644 nerds/models/crf.py create mode 100644 nerds/models/dictionary.py create mode 100644 nerds/models/elmo.py create mode 100644 nerds/models/spacy.py create mode 100644 nerds/test/data/example.ents create mode 100644 nerds/test/data/example.iob create mode 100644 nerds/test/test_base_ner.py create mode 100644 nerds/test/test_bilstm_ner.py create mode 100644 nerds/test/test_crf_ner.py create mode 100644 nerds/test/test_dictionary_ner.py create mode 100644 nerds/test/test_elmo_ner.py create mode 
100644 nerds/test/test_spacy_ner.py
 create mode 100644 nerds/test/test_utils.py
 create mode 100644 nerds/utils.py

diff --git a/nerds/__init__.py b/nerds/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py
new file mode 100644
index 0000000..bc63642
--- /dev/null
+++ b/nerds/models/__init__.py
@@ -0,0 +1,15 @@
+from nerds.models.base import NERModel
+from nerds.models.bilstm import BiLstmCrfNER
+from nerds.models.crf import CrfNER
+from nerds.models.spacy import SpacyNER
+from nerds.models.dictionary import DictionaryNER
+from nerds.models.elmo import ElmoNER
+
+__all__ = [
+    "NERModel",
+    "DictionaryNER",
+    "CrfNER",
+    "SpacyNER",
+    "BiLstmCrfNER",
+    "ElmoNER"
+]
diff --git a/nerds/models/base.py b/nerds/models/base.py
new file mode 100644
index 0000000..fdd2454
--- /dev/null
+++ b/nerds/models/base.py
@@ -0,0 +1,42 @@
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import accuracy_score
+
+from nerds.utils import flatten_lol
+
+
+class NERModel(BaseEstimator, ClassifierMixin):
+    """ Provides a basic interface to train NER models and annotate documents.
+
+        This is the core class responsible for training models that perform
+        named entity recognition, and retrieving named entities from documents.
+    """
+    def __init__(self, entity_label=None):
+        self.entity_label = entity_label
+        self.key = ""  # To be added in subclass.
+
+    def fit(self, X, y):
+        """ Train the model using data (X) and labels (y). Return trained model.
+        """
+        raise NotImplementedError()
+
+    def predict(self, X):
+        """ Makes predictions using trained model on data (X) and returns them.
+        """
+        raise NotImplementedError()
+
+    def save(self, file_path):
+        """ Saves a model to the local disk, provided a file path.
+            Should be overridden.
+        """
+        raise NotImplementedError()
+
+    def load(self, file_path):
+        """ Loads a model saved locally. Should be overridden. """
+        raise NotImplementedError()
+
+    def score(self, X, y, sample_weights=None):
+        """ Returns score for the model based on predicting on (X, y). This
+            method is needed for GridSearch like operations.
+        """
+        y_pred = self.predict(X)
+        return accuracy_score(flatten_lol(y), flatten_lol(y_pred))
diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py
new file mode 100644
index 0000000..2d6780b
--- /dev/null
+++ b/nerds/models/bilstm.py
@@ -0,0 +1,161 @@
+from anago.models import BiLSTMCRF, save_model, load_model
+from anago.preprocessing import IndexTransformer
+from anago.trainer import Trainer
+from anago.tagger import Tagger
+
+from keras.optimizers import Adam
+
+from nerds.models import NERModel
+from nerds.utils import get_logger
+
+from sklearn.model_selection import train_test_split
+
+import os
+
+log = get_logger()
+
+
+class BiLstmCrfNER(NERModel):
+
+    def __init__(self, entity_label=None):
+        """ Build an Anago Bi-LSTM CRF model.
+
+            Args:
+                entity_label: label for single entity NER, default None
+        """
+        super().__init__(entity_label)
+        self.key = "anago_bilstmcrf"
+        # populated by fit() and load(), expected by save() and transform()
+        self.preprocessor = None
+        self.model = None
+        self.trainer = None
+        self.tagger = None
+
+    def fit(self, X, y,
+            word_embedding_dim=100,
+            char_embedding_dim=25,
+            word_lstm_size=100,
+            char_lstm_size=25,
+            fc_dim=100,
+            dropout=0.5,
+            embeddings=None,
+            use_char=True,
+            use_crf=True,
+            batch_size=16,
+            learning_rate=0.001,
+            num_epochs=10):
+        """ Trains the NER model. Inputs are lists of tokenized sentences
+            and their corresponding lists of BIO tags.
+
+            Args:
+                X list(list(str)): list of list of tokens.
+                y list(list(str)): list of list of BIO tags.
+                word_embedding_dim (int): word embedding dimensions.
+                char_embedding_dim (int): character embedding dimensions.
+                word_lstm_size (int): word tagger LSTM output dimensions.
+                char_lstm_size (int): character LSTM feature extractor output dimensions.
+                fc_dim (int): output fully-connected layer size.
+                dropout (float): dropout rate.
+                embeddings (numpy array): word embedding matrix.
+                use_char (boolean): add char feature.
+                use_crf (boolean): use crf as last layer.
+                batch_size (int): training batch size.
+                learning_rate (float): learning rate for Adam optimizer.
+                num_epochs (int): number of epochs of training.
+        """
+        log.info("Preprocessing dataset...")
+        self.preprocessor = IndexTransformer(use_char=use_char)
+        self.preprocessor.fit(X, y)
+
+        log.info("Building model...")
+        self.model = BiLSTMCRF(
+            char_embedding_dim=char_embedding_dim,
+            word_embedding_dim=word_embedding_dim,
+            char_lstm_size=char_lstm_size,
+            word_lstm_size=word_lstm_size,
+            char_vocab_size=self.preprocessor.char_vocab_size,
+            word_vocab_size=self.preprocessor.word_vocab_size,
+            num_labels=self.preprocessor.label_size,
+            dropout=dropout,
+            use_char=use_char,
+            use_crf=use_crf)
+        self.model, loss = self.model.build()
+        optimizer = Adam(lr=learning_rate)
+        self.model.compile(loss=loss, optimizer=optimizer)
+        self.model.summary()
+
+        log.info('Training the model...')
+        self.trainer = Trainer(self.model, preprocessor=self.preprocessor)
+
+        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
+            test_size=0.1, random_state=42)
+        self.trainer.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
+            batch_size=batch_size, epochs=num_epochs)
+
+        self.tagger = Tagger(self.model, preprocessor=self.preprocessor)
+
+        return self
+
+    def predict(self, X):
+        """ Predicts using the NER model.
+
+            Args:
+                X list(list(str)): list of list of tokens.
+            Returns:
+                y list(list(str)): list of list of predicted BIO tags.
+        """
+        if self.tagger is None:
+            raise ValueError("No tagger found, either run fit() to train or load() a trained model")
+
+        log.info("Predicting from model...")
+        ypreds = [self.tagger.predict(" ".join(x)) for x in X]
+        return ypreds
+
+    def save(self, dirpath):
+        """ Saves model to local disk, given a dirpath.
+
+            Args:
+                dirpath (str): a directory where model artifacts will be saved.
+                    Model saves a weights.h5 weights file, a params.json parameter
+                    file, and a preprocessor.pkl preprocessor file.
+        """
+        if self.model is None or self.preprocessor is None:
+            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
+
+        if not os.path.exists(dirpath):
+            os.makedirs(dirpath)
+
+        weights_file = os.path.join(dirpath, "weights.h5")
+        params_file = os.path.join(dirpath, "params.json")
+        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
+
+        save_model(self.model, weights_file, params_file)
+        self.preprocessor.save(preprocessor_file)
+
+    def load(self, dirpath):
+        """ Loads a trained model from local disk, given the dirpath.
+
+            Args:
+                dirpath (str): a directory where model artifacts are saved.
+        """
+        if not os.path.exists(dirpath):
+            raise ValueError("Model directory not found: {:s}".format(dirpath))
+
+        weights_file = os.path.join(dirpath, "weights.h5")
+        params_file = os.path.join(dirpath, "params.json")
+        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
+
+        if not (os.path.exists(weights_file) and
+                os.path.exists(params_file) and
+                os.path.exists(preprocessor_file)):
+            raise ValueError("Model files may be corrupted, exiting")
+
+        self.model = load_model(weights_file, params_file)
+        self.preprocessor = IndexTransformer.load(preprocessor_file)
+        self.tagger = Tagger(self.model, preprocessor=self.preprocessor)
+
+        return self
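A quick sketch of the calling convention the new BiLstmCrfNER expects: plain token lists and BIO tag lists rather than AnnotatedDocument objects. The two-sentence dataset is invented and only exercises the API:

    from nerds.models import BiLstmCrfNER

    X = [["Pierre", "Vinken", "will", "join", "the", "board"],
         ["He", "lives", "in", "New", "York"]]
    y = [["B-PER", "I-PER", "O", "O", "O", "O"],
         ["O", "O", "O", "B-LOC", "I-LOC"]]

    model = BiLstmCrfNER()
    model.fit(X, y, num_epochs=1)       # internally splits off a validation set
    y_pred = model.predict(X)           # list of BIO tag lists
    model.save("/tmp/bilstm-model")     # hypothetical output directory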
+ """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + if not (os.path.exists(weights_file) or + os.path.exists(params_file) or + os.path.exists(preprocessor_file)): + raise ValueError("Model files may be corrupted, exiting") + + self.model = load_model(weights_file, params_file) + self.preprocessor = IndexTransformer.load(preprocessor_file) + self.tagger = Tagger(self.model, preprocessor=self.preprocessor) + + return self + diff --git a/nerds/models/crf.py b/nerds/models/crf.py new file mode 100644 index 0000000..6803cc0 --- /dev/null +++ b/nerds/models/crf.py @@ -0,0 +1,226 @@ +from nerds.models import NERModel +from nerds.utils import get_logger + +from sklearn.externals import joblib + +import os +import sklearn_crfsuite +import spacy + +log = get_logger() + + +class CrfNER(NERModel): + + def __init__(self, entity_label=None): + """ Build a sklearn.crfsuite.CRF CRF model + + Args: + entity_label (str): label for single entity NER, default None + """ + super().__init__(entity_label) + self.key = "crfsuite_crf" + self.nlp = None + self.model = None + + + def fit(self, X, y, + is_featurized=False, + max_iterations=100, + c1=0.1, + c2=0.1): + """ Build feature vectors and train CRF model. Wrapper for + sklearn_crfsuite.CRF model. The underlying model takes many + parameters (for full list (and possible future enhancement), see + https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html#CRF) + + Args: + X (list(list(str))) or (list(list(dict(str, str)))): list of + sentences or features. Sentences are tokenized into list + of words, and features are a list of word features, each + word feature is a dictionary of name-value pairs. + y (list(list(str))): list of list of BIO tags. + is_featurized (bool, default False): if True, X is a list of list + of features, else X is a list of list of words. + max_iterations (int, default 100): maximum number of + iterations to run CRF training + c1 (float, default 0.1): L1 regularization coefficient. + c2 (float, default 0.1): L2 regularization coefficient. + """ + if not is_featurized: + log.info("Generating features for {:d} samples...".format(len(X))) + if self.nlp is None: + self.nlp = self._load_language_model() + features = [self._sent2features(sent, self.nlp) for sent in X] + + log.info("Building model...") + self.model = sklearn_crfsuite.CRF( + algorithm="lbfgs", + c1=c1, + c2=c2, + max_iterations=max_iterations, + all_possible_transitions=True, + verbose=True) + + log.info("Training model...") + self.model.fit(X if is_featurized else features, y) + + return self + + + def predict(self, X, is_featurized=False): + """ Predicts using trained CRF model. + + Args: + X (list(list(dict(str, str))) or list(list(str))): list + of sentences or features. + is_featurized (bool, default False): if True, X is a list + of list of features, else X is a list of list of tokens. + Returns: + y (list(list(str))): list of list of predicted BIO tags. 
+ """ + if self.model is None: + raise ValueError("CRF model not found, run fit() to train or load() pre-trained model") + + if not is_featurized: + log.info("Generating features for {:d} samples".format(len(X))) + if self.nlp is None: + self.nlp = self._load_language_model() + features = [self._sent2features(sent, self.nlp) for sent in X] + + return self.model.predict(X if is_featurized else features) + + + def save(self, dirpath): + """ Save a trained CRF model at dirpath. + + Args: + dirpath (str): path to model directory. + """ + if self.model is None: + raise ValueError("No model to save, run fit() to train or load() pre-trained model") + + if not os.path.exists(MODEL_DIR): + os.makedirs(MODEL_DIR) + model_file = os.path.join(dirpath, "crf-model.pkl") + joblib.dump(self.model, model_file) + + + def load(self, dirpath): + """ Load a pre-trained CRF model from dirpath. + + Args: + dirpath (str): path to model directory. + Returns: + this object populated with pre-trained model. + """ + model_file = os.path.join(dirpath, "crf-model.pkl") + if not os.path.exists(model_file): + raise ValueError("No CRF model to load at {:s}, exiting.".format(model_file)) + + self.model = joblib.load(model_file) + return self + + + def _load_language_model(self): + return spacy.load("en") + + + def _sent2features(self, sent, nlp): + """ Converts a list of tokens to a list of features for CRF. + Each feature is a dictionary of feature name value pairs. + """ + doc = nlp(" ".join(sent)) + postags = [token.pos_ for token in doc] + features = [self._word2featdict(sent, postags, i) for i in range(len(sent))] + return features + + + def _word2featdict(self, sent, postags, pos): + """ Build up a default feature dictionary for each word in sentence. + The default considers a window size of 2 around each word, so it + includes word-1, word-2, word, word+1, word+2. 
For each word, we + consider: + - prefix and suffix of size 2 and 3 + - the word itself, lowercase + - is_upper, is_lower, begin with upper, is_digit + - POS tag, and POS tag prefix of size 2 + """ + # current word + word = sent[pos] + postag = postags[pos] + feat_dict = { + 'bias': 1.0, + 'word[-2]': word[-2:], + 'word[-3:]': word[-3:], + 'word.lower()': word.lower(), + 'word.isupper()': word.isupper(), + 'word.istitle()': word.istitle(), + 'word.isdigit()': word.isdigit(), + 'postag': postag, + 'postag[:2]': postag[0:2], + } + # word - 2 + if pos > 1: + prev_word2 = sent[pos - 2] + prev_postag2 = postags[pos - 2] + feat_dict.update({ + '-2:word[-2]': prev_word2[-2:], + '-2:word[-3]': prev_word2[-3:], + '-2:word.lower()': prev_word2.lower(), + '-2:word.istitle()': prev_word2.istitle(), + '-2:word.isupper()': prev_word2.isupper(), + '-2:word.isdigit()': prev_word2.isdigit(), + '-2:postag': prev_postag2, + '-2:postag[:2]': prev_postag2[:2], + }) + # word - 1 + if pos > 0: + prev_word = sent[pos - 1] + prev_postag = postags[pos - 1] + feat_dict.update({ + '-1:word[-2]': prev_word[-2:], + '-1:word[-3]': prev_word[-3:], + '-1:word.lower()': prev_word.lower(), + '-1:word.istitle()': prev_word.istitle(), + '-1:word.isupper()': prev_word.isupper(), + '-1:word.isdigit()': prev_word.isdigit(), + '-1:postag': prev_postag, + '-1:postag[:2]': prev_postag[:2], + }) + # first word + if pos == 0: + feat_dict['BOS'] = True + # word + 1 + if pos < len(sent) - 1: + next_word = sent[pos + 1] + next_postag = postags[pos + 1] + feat_dict.update({ + '+1:word[-2]': next_word[-2:], + '+1:word[-3]': next_word[-3:], + '+1:word.lower()': next_word.lower(), + '+1:word.istitle()': next_word.istitle(), + '+1:word.isupper()': next_word.isupper(), + '+1:word.isdigit()': next_word.isdigit(), + '+1:postag': next_postag, + '+1:postag[:2]': next_postag[:2], + }) + # word + 2 + if pos < len(sent) - 2: + next_word2 = sent[pos + 2] + next_postag2 = postags[pos + 2] + feat_dict.update({ + '+2:word[-2]': next_word2[-2:], + '+2:word[-3]': next_word2[-3:], + '+2:word.lower()': next_word2.lower(), + '+2:word.istitle()': next_word2.istitle(), + '+2:word.isupper()': next_word2.isupper(), + '+2:word.isdigit()': next_word2.isdigit(), + '+2:postag': next_postag2, + '+2:postag[:2]': next_postag2[:2], + }) + # last word + if pos == len(sent) - 1: + feat_dict['EOS'] = True + return feat_dict + diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py new file mode 100644 index 0000000..eeac00a --- /dev/null +++ b/nerds/models/dictionary.py @@ -0,0 +1,233 @@ +from nerds.models import NERModel +from nerds.utils import get_logger + +from sklearn.externals import joblib + +import ahocorasick +import os + +log = get_logger() + +class DictionaryNER(NERModel): + + def __init__(self, entity_label=None): + super().__init__(entity_label) + self.key = "aho-corasick-dict-ner" + self.model = None + + + def fit(self, X, y, + combine_tokens=True): + """ Build dictionary of phrases of different entity types. + + Args: + X (list(list(str))): list of list of tokens or phrases. + combine_tokens (bool, default True): if combine tokens + is True, then input is tokenized as individual words. + This would be the expected format if the input came + directly from a training set. + + X = [..., [..., "New", "York", "City", ...], ...] + y = [..., [..., "B-loc", "I-loc", "I-loc", ...], ...] + + If combine_tokens is False, then phrases have been + pre-chunked. This would be the expected format if the + input came from a third party dictionary. 
+ + X = [..., [..., "New York City", ...], ...] + y = [..., [..., "loc", ...], ...] + + y (list(list(str))): list of list of labels. If combine_tokens + is True, then labels are IOB tags. If combine_tokens is False, + labels are entity types (without leading B and I), and without + any O labels. + + combine_tokens (bool, default True): if True, input comes from + standard training set, and an additional step to chunk + phrases is needed. If False, input comes from a dictionary + with phrase chunking already done. + """ + self.model = ahocorasick.Automaton() + + if combine_tokens: + for idx, (tokens, labels) in enumerate(zip(X, y)): + phrase_tokens, phrase_labels = self._combine_tokens(tokens, labels) + for phrase, label in zip(phrase_tokens, phrase_labels): + self.model.add_word(phrase, (label, phrase)) + else: + for token, label in zip(X, y): + self.model.add_word(token, (label, token)) + self.model.make_automaton() + + return self + + + def predict(self, X): + if self.model is None: + raise ValueError("No model found, use fit() to train or load() pretrained.") + + predictions = [] + for tokens in X: + sent = " ".join(tokens) + matched_phrases = [] + for end_index, (tag, phrase) in self.model.iter(sent): + start_index = end_index - len(phrase) + 1 + # filter out spurious matches on partial words + self._add_if_not_spurious_match( + start_index, end_index, tag, sent, matched_phrases) + # remove subsumed phrases + longest_phrases = self._remove_subsumed_matches(matched_phrases, 1) + # convert longest matches to IOB format + pred = self._convert_matches_to_iob_tags(tokens, longest_phrases) + predictions.append(pred) + + return predictions + + + def save(self, dirpath=None): + if self.model is None: + raise ValueError("No model found, use fit() to train or load() pretrained.") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + log.info("Saving model...") + model_file = os.path.join(dirpath, "dictionary-ner.pkl") + joblib.dump(self.model, model_file) + + + def load(self, dirpath=None): + model_file = os.path.join(dirpath, "dictionary-ner.pkl") + if not os.path.exists(model_file): + raise ValueError("Saved model {:s} not found.".format(model_file)) + + self.model = joblib.load(model_file) + return self + + + def _combine_tokens(self, tokens, labels): + """ Combine consecutive word tokens for some given entity type + to create phrase tokens. + + Args: + tokens (list(str)): a list of tokens representing a sentence. + labels (list(str)): a list of IOB tags for sentence. + + Returns: + phrases (list(str)): list of multi-word phrases. + phrase_labels (list(str)): list of phrase entity types. + """ + phrases, phrase_labels = [], [] + phrase_tokens = [] + for token, label in zip(tokens, labels): + if label == "O" and len(phrase_tokens) > 0: + phrases.append(" ".join(phrase_tokens)) + phrase_labels.append(prev_label.split("-")[-1]) + phrase_tokens = [] + if label.startswith("B-"): + phrase_tokens = [token] + if label.startswith("I-"): + phrase_tokens.append(token) + prev_label = label + + if len(phrase_tokens) > 0: + phrases.append(" ".join(phrase_tokens)) + phrase_labels.append(prev_label.split("-")[-1]) + + return phrases, phrase_labels + + + def _add_if_not_spurious_match(self, start_index, end_index, tag, + sentence, matched_phrases): + """ Aho-Corasick can match across word boundaries, and often matches + parts of longer words. This function checks to make sure any + matches it reports don't do so. + + Args: + start_index (int): reported start index of matched phrase. 
+ end_index (int): reported end index of matched phrase. + tag (str): the entity type. + sentence (str): the sentence in which match occurs. + matched_phrases (list(str)): list of matched phrases, updated + in place by function. + """ + if start_index == 0: + if end_index < len(sentence): + if sentence[end_index + 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + elif end_index + 1 == len(sentence): + if start_index > 0: + if sentence[start_index - 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + else: + if sentence[start_index - 1] == " " and sentence[end_index + 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + + + def _remove_subsumed_matches(self, matched_phrases, k): + """ Remove matches that are subsumed in longer matches. This ensures + that the matches reported are the longest ones. Function works as + follows -- we sort the list by longest phrase first, and then check + to see if any shorter phrases are contained within the longest one + and remove them if so. We then recursively apply this same function + to the remaining list, moving one position down for the longest + phrase to match against. Function stops when we have seen all the + phrases. + + Args: + matched_phrases (list((start, end, iob_tag))): list of + matched phrase tuples. + k (int): starting position. + + Returns: + matched_phrases: without the shorter subsumed phrase tuples. + """ + if k >= len(matched_phrases): + return matched_phrases + sorted_matches = sorted(matched_phrases, key=lambda x: x[1]-x[0], reverse=True) + longest_matches = sorted_matches[0:k] + ref_offsets = (longest_matches[-1][0], longest_matches[-1][1]) + for phrase in sorted_matches[k:]: + if phrase[0] >= ref_offsets[0] and phrase[1] <= ref_offsets[1]: + continue + else: + longest_matches.append(phrase) + return self._remove_subsumed_matches(longest_matches, k+1) + + + def _convert_matches_to_iob_tags(self, tokens, matched_phrases): + """ Merges the longest matches with the original tokens to + produce a list of IOB tags for the sentence. + + Args: + tokens (list(str)): list of tokens for the sentence. + matched_phrase (list((start, end, tag))): list of longest + matched phrase tuples. + + Returns: + iob_tags (list(str)): list of IOB tags, each tag + corresponds to a word token. 
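+
+        Example (illustrative, with hypothetical values):
+            tokens = ["I", "visited", "New", "York", "City"]
+            matched_phrases = [(10, 23, "loc")]
+            # result: ["O", "O", "B-loc", "I-loc", "I-loc"]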
+ """ + iob_tags = [] + curr_offset = 0 + prev_label = "O" + for token in tokens: + start_offset = curr_offset + end_offset = start_offset + len(token) + token_matched = False + matched_label = None + for phrase_start, phrase_end, phrase_label in matched_phrases: + if start_offset >= phrase_start and end_offset <= phrase_end: + token_matched = True + matched_label = phrase_label + break + if token_matched: + iob_tags.append( + "I-" + phrase_label if prev_label == phrase_label + else "B-" + phrase_label) + prev_label = phrase_label + else: + iob_tags.append("O") + prev_label = "O" + curr_offset = end_offset + 1 + return iob_tags diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py new file mode 100644 index 0000000..22deeef --- /dev/null +++ b/nerds/models/elmo.py @@ -0,0 +1,174 @@ +from anago.utils import load_data_and_labels, load_glove, filter_embeddings +from anago.models import ELModel, save_model, load_model +from anago.preprocessing import ELMoTransformer +from anago.trainer import Trainer +from anago.tagger import Tagger + +from keras.optimizers import Adam + +from nerds.models import NERModel +from nerds.utils import get_logger + +from sklearn.model_selection import train_test_split + +import os + +log = get_logger() + + +class ElmoNER(NERModel): + + def __init__(self, entity_label=None): + """ Build a Anago Bi-LSTM CRF model. + + Args: + entity_label: label for single entity NER, default None + """ + super().__init__(entity_label) + self.key = "anago_elmo" + # populated by fit() and load(), expected by save() and transform() + self.preprocessor = None + self.model = None + self.trainer = None + self.tagger = None + + + def fit(self, X, y, + word_embedding_dim=100, + char_embedding_dim=25, + word_lstm_size=100, + char_lstm_size=25, + fc_dim=100, + dropout=0.5, + embeddings=None, + embeddings_file=None, + embeddings_dim=None, + batch_size=16, + learning_rate=0.001, + num_epochs=10): + """ Trains the NER model. Input is list of AnnotatedDocuments. + + Args: + X list(list(str)): list of list of tokens + y list(list(str)): list of list of BIO tags + word_embedding_dim (int): word embedding dimensions. + char_embedding_dim (int): character embedding dimensions. + word_lstm_size (int): character LSTM feature extractor output dimensions. + char_lstm_size (int): word tagger LSTM output dimensions. + fc_dim (int): output fully-connected layer size. + dropout (float): dropout rate. + embeddings (numpy array): word embedding matrix. + embeddings_file (str): path to embedding file. + embeddings_dim (int): size of embedding vector. + use_char (boolean): add char feature. + use_crf (boolean): use crf as last layer. + batch_size training batch size. + learning_rate learning rate for Adam optimizer. + num_epochs number of epochs of training. 
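+
+        Example (illustrative; assumes GloVe vectors have been
+            downloaded locally):
+            model = ElmoNER()
+            model.fit(X, y, embeddings_file="glove.6B.100d.txt",
+                      embeddings_dim=100, num_epochs=2)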
+ """ + if embeddings is None and (embeddings_file is None or embeddings_dim is None): + raise ValueError("Either embeddings should be provided, or both embeddings_file and embeddings_dim should be provided, exiting.") + + log.info("Preprocessing dataset...") + self.preprocessor = ELMoTransformer() + self.preprocessor.fit(X, y) + + if embeddings is None: + embeddings = load_glove(embeddings_file) + embeddings = filter_embeddings(embeddings, + self.preprocessor._word_vocab.vocab, + embeddings_dim) + + log.info("Building model...") + self.model = ELModel( + char_embedding_dim=char_embedding_dim, + word_embedding_dim=word_embedding_dim, + char_lstm_size=char_lstm_size, + word_lstm_size=word_lstm_size, + char_vocab_size=self.preprocessor.char_vocab_size, + word_vocab_size=self.preprocessor.word_vocab_size, + num_labels=self.preprocessor.label_size, + embeddings=embeddings, + dropout=dropout) + + self.model, loss = self.model.build() + optimizer = Adam(lr=learning_rate) + self.model.compile(loss=loss, optimizer=optimizer) + self.model.summary() + + log.info('Training the model...') + self.trainer = Trainer(self.model, preprocessor=self.preprocessor) + + x_train, x_valid, y_train, y_valid = train_test_split(X, y, + test_size=0.1, random_state=42) + self.trainer.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, + batch_size=batch_size, epochs=num_epochs) + + self.tagger = Tagger(self.model, preprocessor=self.preprocessor) + + return self + + + def predict(self, X): + """ Predicts using the NER model. + + Args: + X list(list(str)): list of list of tokens. + Returns: + y list(list(str)): list of list of predicted BIO tags. + """ + if self.tagger is None: + raise ValueError("No tagger found, either run fit() to train or load() a trained model") + + log.info("Predicting from model...") + ypreds = [self.tagger.predict(" ".join(x)) for x in X] + return ypreds + + + def save(self, dirpath): + """ Saves model to local disk, given a dirpath + + Args: + dirpath (str): a directory where model artifacts will be saved. + Model saves a weights.h5 weights file, a params.json parameter + file, and a preprocessor.pkl preprocessor file. + """ + if self.model is None or self.preprocessor is None: + raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + save_model(self.model, weights_file, params_file) + self.preprocessor.save(preprocessor_file) + + + def load(self, dirpath): + """ Loads a trained model from local disk, given the dirpath + + Args: + dirpath (str): a directory where model artifacts are saved. 
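+
+        Example (illustrative; the path is a placeholder):
+            model = ElmoNER().load("/path/to/model_dir")
+            ypreds = model.predict(X)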
+ """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + if not (os.path.exists(weights_file) or + os.path.exists(params_file) or + os.path.exists(preprocessor_file)): + raise ValueError("Model files may be corrupted, exiting") + + self.model = load_model(weights_file, params_file) + self.preprocessor = ELMoTransformer.load(preprocessor_file) + self.tagger = Tagger(self.model, preprocessor=self.preprocessor) + + return self + + diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py new file mode 100644 index 0000000..0694a93 --- /dev/null +++ b/nerds/models/spacy.py @@ -0,0 +1,238 @@ +from nerds.models import NERModel +from nerds.utils import get_logger + +from spacy.util import minibatch + +import itertools +import os +import random +import spacy + +log = get_logger() + + +class SpacyNER(NERModel): + + def __init__(self, entity_label=None): + """ Build a SpaCy EntityRecognizer NER model. + + Args: + entity_label (str, default None): entity label for single class NER. + """ + super().__init__(entity_label) + self.key = "spacy_ner" + self.model = None + + + def fit(self, X, y, + num_epochs=20, + dropout=0.1, + batch_size=32): + """ Trains the SpaCy NER model. + + Args: + X (list(list(str))): list of tokenized sentences, or list of list + of tokens. + y (list(list(str))): list of list of IOB tags. + num_epochs (int): number of epochs of training. + dropout (float): rate of dropout during training between 0 and 1. + batch_size (int): batch size to use during training + """ + log.info("Reformatting data to SpaCy format...") + features = [self._convert_to_spacy(tokens, labels) + for tokens, labels in zip(X, y)] + + log.info("Building SpaCy NER model...") + self.model = spacy.blank("en") + if "ner" not in self.model.pipe_names: + ner = self.model.create_pipe("ner") + self.model.add_pipe(ner) + else: + ner = self.model.get_pipe("ner") + + unique_labels = set() + for _, annotations in features: + for ent in annotations.get("entities"): + unique_labels.add(ent[2]) + ner.add_label(ent[2]) + + for label in list(unique_labels): + ner.add_label("B-" + label) + ner.add_label("I-" + label) + ner.add_label("O") + + log.info("Training SpaCy NER model...") + optimizer = self.model.begin_training() + + other_pipes = [p for p in self.model.pipe_names if p != "ner"] + with self.model.disable_pipes(*other_pipes): + for it in range(num_epochs): + random.shuffle(features) + losses = {} + batches = minibatch(features, size=batch_size) + for batch in batches: + texts, annotations = zip(*batch) + self.model.update(texts, annotations, + sgd=optimizer, + drop=dropout, + losses=losses) + loss_value = losses["ner"] + log.info("Epoch: {:d} loss: {:.5f}".format(it, loss_value)) + + return self + + + def predict(self, X): + """ Predicts using trained SpaCy NER model. + + Args: + X (list(list(str))): list of tokenized sentences. + is_featurized (bool, default False): if True, X is a list + of list of features, else X is a list of list of tokens. + Returns: + y (list(list(str))): list of list of predicted BIO tags. 
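+
+        Example (illustrative, with hypothetical tokens):
+            y_pred = model.predict([["Pierre", "Vinken", "will", "join"]])
+            # e.g. [["B-PER", "I-PER", "O", "O"]]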
+ """ + if self.model is None: + raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") + + log.info("Generating predictions...") + preds = [] + for sent_tokens in X: + sent = " ".join(sent_tokens) + doc = self.model(sent) + entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + sent_preds = self._convert_from_spacy(sent, entities) + preds.append(sent_preds) + + return preds + + + def save(self, dirpath): + """ Save trained SpaCy NER model at dirpath. + + Args: + dirpath (str): path to model directory. + """ + if self.model is None: + raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model") + + log.info("Saving model...") + if not os.path.exists(dirpath): + os.makedirs(dirpath) + self.model.to_disk(dirpath) + + + def load(self, dirpath): + """ Load a pre-trained SpaCy NER model from dirpath. + + Args: + dirpath (str): path to model directory. + Returns: + this object populated with pre-trained model. + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory {:s} not found".format(dirpath)) + + log.info("Loading model...") + self.model = spacy.load(MODEL_DIR) + return self + + + def _convert_to_spacy(self, tokens, labels): + """ Convert data and labels for single sentence to SpaCy specific format: + + Args: + tokens (list(str)): list of tokens. + labels (list(str)): list of IOB tags. + + Returns: + list of tuples in SpaCy format as shown below: + ( + "The quick brown fox jumps over the lazy dog", + { + "entities": [ + (16, 19, "ANIMAL"), + (40, 43, "ANIMAL") + ] + } + ) + """ + content = " ".join(tokens) + offsets, current_offset = [], 0 + for token, label in zip(tokens, labels): + start_offset = current_offset + end_offset = start_offset + len(token) + if label != "O": + offsets.append((start_offset, end_offset, label.split("-")[-1])) + current_offset = end_offset + 1 # skip space + return (content, {"entities": offsets}) + + + def _entities2dict(self, entities): + """ Convert entities returned from SpaCy into a dictionary keyed by + offset pair. This allows predicted entities to be looked up quickly + and the appropriate tag populated in _convert_from_spacy(). + + Args: + entities (list((begin, end, label))): list of label predictions. + + Return: + entity_dict (dict{(begin, end): label}) + """ + entity_dict = {} + for begin, end, label in entities: + key = ":".join([str(begin), str(end)]) + entity_dict[key] = label + return entity_dict + + + def _tokenize_with_offsets(self, sent): + """ Tokenize a sentence by space, and write out tuples that include + offsets for each token. + + Args: + sent (str): sentence as a string. + + Returns: + offsets (list((start, end), token)): list of tokens with + offset information. + """ + tokens = sent.split() + offsets, curr_offset = [], 0 + for token in tokens: + begin = curr_offset + end = begin + len(token) + key = ":".join([str(begin), str(end)]) + offsets.append((key, token)) + curr_offset = end + 1 + return offsets + + + def _convert_from_spacy(self, sent, entities): + """ Converts SpaCy predictions to standard form. + + Args: + sent (str): the sentence as a string. + entities (list(entities)): a list of SpaCy Entity objects + Entity(start_char, end_char, label_). + + Returns: + predictions (list(str)): a list of IOB tags for a single + sentence. 
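+
+        Example (illustrative; the trained model emits per-token spans):
+            sent = "Pierre Vinken retired"
+            entities = [(0, 6, "PER"), (7, 13, "PER")]
+            # result: ["B-PER", "I-PER", "O"]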
+ """ + iob_tags, prev_tag = [], None + entity_dict = self._entities2dict(entities) + for offset_token in self._tokenize_with_offsets(sent): + offset, token = offset_token + if offset in entity_dict: + curr_tag = entity_dict[offset] + if prev_tag is None or prev_tag != curr_tag: + iob_tags.append("B-" + curr_tag) + else: + iob_tags.append("I-" + curr_tag) + else: + curr_tag = "O" + iob_tags.append("O") + prev_tag = curr_tag + return iob_tags + diff --git a/nerds/test/data/example.ents b/nerds/test/data/example.ents new file mode 100644 index 0000000..b836ed9 --- /dev/null +++ b/nerds/test/data/example.ents @@ -0,0 +1,6 @@ +Pierre Vinken PER +Mr . Vinken PER +Elsevier N . V . ORG +61 years old DATE +Nov . 29 DATE +Dutch NORP diff --git a/nerds/test/data/example.iob b/nerds/test/data/example.iob new file mode 100644 index 0000000..0dea1e6 --- /dev/null +++ b/nerds/test/data/example.iob @@ -0,0 +1,38 @@ +Pierre B-PER +Vinken I-PER +, O +61 B-DATE +years I-DATE +old I-DATE +, O +will O +join O +the O +board O +as O +a O +nonexecutive O +director O +Nov B-DATE +. I-DATE +29 I-DATE +. O + +Mr B-PER +. I-PER +Vinken I-PER +is O +chairman O +of O +Elsevier B-ORG +N I-ORG +. I-ORG +V I-ORG +. I-ORG +, O +the O +Dutch B-NORP +publishing O +group O +. O + diff --git a/nerds/test/test_base_ner.py b/nerds/test/test_base_ner.py new file mode 100644 index 0000000..6e06cff --- /dev/null +++ b/nerds/test/test_base_ner.py @@ -0,0 +1,27 @@ +from nose.tools import assert_equal, assert_raises + +from nerds.models import NERModel + +def test_fit(): + model = NERModel() + assert_raises(NotImplementedError, model.fit, [], []) + + +def test_predict(): + model = NERModel() + assert_raises(NotImplementedError, model.predict, []) + + +def test_load(): + model = NERModel() + assert_raises(NotImplementedError, model.load, "") + + +def test_save(): + model = NERModel() + assert_raises(NotImplementedError, model.save, "") + + +def test_score(): + model = NERModel() + assert_raises(NotImplementedError, model.score, [], []) diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py new file mode 100644 index 0000000..55f17c4 --- /dev/null +++ b/nerds/test/test_bilstm_ner.py @@ -0,0 +1,13 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import BiLstmCrfNER +from nerds.utils import load_data_and_labels + +def test_crf_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = BiLstmCrfNER() + model.fit(X, y, num_epochs=1) + y_pred = model.predict(X) + # there is not enough data to train this model properly, so decent + # asserts are unlikely to succeed. 
+ assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") diff --git a/nerds/test/test_crf_ner.py b/nerds/test/test_crf_ner.py new file mode 100644 index 0000000..73d239e --- /dev/null +++ b/nerds/test/test_crf_ner.py @@ -0,0 +1,12 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import CrfNER +from nerds.utils import load_data_and_labels + +def test_crf_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = CrfNER() + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) diff --git a/nerds/test/test_dictionary_ner.py b/nerds/test/test_dictionary_ner.py new file mode 100644 index 0000000..d83e491 --- /dev/null +++ b/nerds/test/test_dictionary_ner.py @@ -0,0 +1,30 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import DictionaryNER +from nerds.utils import load_data_and_labels + +def test_dictionary_ner_from_conll(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = DictionaryNER() + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) + + +def test_dictionary_ner_from_dict(): + # load and fit model from dictionary + xs, ys = [], [] + fdict = open("nerds/test/data/example.ents", "r") + for line in fdict: + x, y = line.strip().split('\t') + xs.append(x) + ys.append(y) + fdict.close() + model = DictionaryNER() + model.fit(xs, ys, combine_tokens=False) + # predict using example + X, y = load_data_and_labels("nerds/test/data/example.iob") + y_pred = model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py new file mode 100644 index 0000000..a99eed4 --- /dev/null +++ b/nerds/test/test_elmo_ner.py @@ -0,0 +1,17 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import ElmoNER +from nerds.utils import load_data_and_labels + +import numpy as np + +def test_crf_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = ElmoNER() + # there are 28 unique words in our "vocabulary" + embeddings = np.random.random((28, 100)) + model.fit(X, y, embeddings=embeddings, num_epochs=1) + y_pred = model.predict(X) + # there is not enough data to train this model properly, so decent + # asserts are unlikely to succeed. 
+ assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py new file mode 100644 index 0000000..64bc76a --- /dev/null +++ b/nerds/test/test_spacy_ner.py @@ -0,0 +1,12 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import SpacyNER +from nerds.utils import load_data_and_labels + +def test_crf_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = SpacyNER() + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py new file mode 100644 index 0000000..aa96016 --- /dev/null +++ b/nerds/test/test_utils.py @@ -0,0 +1,17 @@ +from nose.tools import assert_equal, assert_true + +from nerds.utils import * + + +def test_load_data_and_labels(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + assert_true(len(X) == 2, "There should be 2 sentences in X") + assert_equal(len(X), len(y), "There should be tags for 2 sentences in y") + assert_equal(len(X[0]), len(y[0]), "Number of tokens should be equal to number of tags") + + +def test_flatten_lol(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + yflat = flatten_lol(y, strip_prefix=True) + assert_equal(36, len(yflat), "There should be 36 tags in all") + assert_equal(5, len([y for y in yflat if y == "PER"]), "There should be 5 PER tags") diff --git a/nerds/utils.py b/nerds/utils.py new file mode 100644 index 0000000..652b9e7 --- /dev/null +++ b/nerds/utils.py @@ -0,0 +1,48 @@ +import anago +import itertools +import logging + + +def get_logger(log_level="DEBUG"): + # TODO: The log level should be adjusted by some kind of configuration + # file, e.g. the dev build should have DEBUG, while the release build + # should have "WARN" or higher. + f = "%(levelname)s %(asctime)s %(module)s %(filename)s: %(message)s" + logging.basicConfig(format=f) + logger = logging.getLogger(__name__) + logger.setLevel(log_level) + return logger + + +def load_data_and_labels(filepath): + """ Wrapper to expose anago's load_data_and_labels. Built here as + a wrapper because users of non-neural models are not expected + to be familiar with Anago. + + Args: + filepath (str): path to the file in IOB format to be loaded. + + Returns: + x (list(str)): list of tokens. + y (list(str)): list of tags. + """ + return anago.utils.load_data_and_labels(filepath) + + +def flatten_lol(xs, strip_prefix=True): + """ Flatten label or predictions from list(list(str)) to list(str). + Flattened list can be input to scikit-learn's standard functions + to compute various metrics. + + Args: + xs (list(list(str))): list of list of tags (inner list is sentence). + strip_prefix (bool): if True, remove leading I- and B-, else retain. + """ + def strip_iob_prefix(label): + return label.split('-')[-1] + if strip_prefix: + return [strip_iob_prefix(x) for x in itertools.chain.from_iterable(xs)] + else: + return [x for x in itertools.chain.from_iterable(xs)] + + From fe0de327493cf52b20d531b7b12a473751028e1d Mon Sep 17 00:00:00 2001 From: sujitpal Date: Wed, 13 Nov 2019 15:53:42 -0800 Subject: [PATCH 10/64] Initial commit for Ensemble NER (maximal hard voting) with weights plus test. 
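
The ensemble merges token-level predictions by weighted hard voting: each
estimator's predicted tag is replicated according to its weight, and the most
frequent tag wins at each token position. A minimal usage sketch (illustrative;
the estimators, their fit parameters, and the weights are placeholders):

```python
from nerds.models import CrfNER, SpacyNER, EnsembleNER

estimators = [
    (CrfNER(), {"max_iterations": 10}),
    (SpacyNER(), {"num_epochs": 5}),
]
ensemble = EnsembleNER()
# weight 2 makes the CRF count twice in each token-level vote
ensemble.fit(X, y, estimators=estimators, weights=[2, 1])
y_pred = ensemble.predict(X_test)
```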
--- Makefile | 2 +- nerds/models/__init__.py | 4 +- nerds/models/ensemble.py | 158 ++++++++++++++++++++++++++++++++ nerds/test/test_ensemble_ner.py | 18 ++++ 4 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 nerds/models/ensemble.py create mode 100644 nerds/test/test_ensemble_ner.py diff --git a/Makefile b/Makefile index 6ac10e1..b1a6fb0 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ dist: test: make clean_test - nosetests --with-coverage --cover-html -s --verbosity=2 --cover-package=nerds + nosetests --with-coverage --cover-html -s --verbosity=2 --cover-package=nerds nerds/test/ lint: flake8 --ignore=W605,W504 --verbose nerds diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py index bc63642..da69407 100644 --- a/nerds/models/__init__.py +++ b/nerds/models/__init__.py @@ -4,6 +4,7 @@ from nerds.models.spacy import SpacyNER from nerds.models.dictionary import DictionaryNER from nerds.models.elmo import ElmoNER +from nerds.models.ensemble import EnsembleNER __all__ = [ "NERModel", @@ -11,5 +12,6 @@ "CrfNER", "SpacyNER", "BiLstmCrfNER", - "ElmoNER" + "ElmoNER", + "EnsembleNER" ] diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py new file mode 100644 index 0000000..0ae6b8f --- /dev/null +++ b/nerds/models/ensemble.py @@ -0,0 +1,158 @@ +from nerds.models import NERModel +from nerds.utils import get_logger + +import joblib +import numpy as np +import os + +log = get_logger() + +class EnsembleNER(NERModel): + + def __init__(self, entity_label=None): + super().__init__(entity_label) + self.key = "voting_ensemble" + # these are set by fit and load, required by predict and save + self.estimators = None + self.weights = None + + + def fit(self, X, y, + estimators=[], + weights=None, + is_pretrained=False): + """ Train ensemble by training underlying NERModels. + + Args: + X (list(list(str))): list of list of tokens. + y (list(list(str))): list of list of BIO tags. + estimators (list(NERModel, dict(str,obj)), default empty): list + of (NERModels, fit_param) pairs to use in the ensemble. The + fit_param is a flat dictionary of named arguments used in + fit() for the particular NERModel. + weights (list(int), default None): sequence of weights to + apply to predicted class labels from each estimator. If + None, then predictions from all estimators are treated + equally. + """ + if estimators is None or len(estimators) == 0: + raise ValueError("Non-empty list of estimators required to fit ensemble.") + else: + self.estimators = estimators + if weights is None: + self.weights = [1] * len(estimators) + else: + if len(estimators) != len(weights): + raise ValueError("Number of weights must correspond to number of estimators.") + else: + self.weights = weights + + if is_pretrained: + return self + + # various pickling errors are seen if we use joblib.Parallel to fit + # in parallel across multiple processors. Since normal usage should + # not involve calling fit(), this is okay to keep as sequential. + fitted_estimators = [self._fit_estimator(clf, X, y, fit_params) + for clf, fit_params in self.estimators] + + self.estimators = [(fitted, params) for (clf, params), fitted in + zip(self.estimators, fitted_estimators)] + + return self + + + def predict(self, X): + """ + Predicts using each estimator in the ensemble, then merges the + predictions using a voting scheme given by the vote() method + (subclasses can override voting policy by overriding vote()). + + Args: + X (list(list(str))): list of list of tokens to predict from. 
+
+        Returns:
+            ypred (list(list(str))): list of list of BIO tags.
+        """
+        if self.estimators is None or self.weights is None:
+            raise ValueError("Model not ready to predict. Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True")
+
+        predictions = []
+        for clf, _ in self.estimators:
+            predictions.append(clf.predict(X))
+
+        return self._vote(predictions)
+
+
+    def load(self, dirpath):
+        raise NotImplementedError()
+
+
+    def save(self, dirpath):
+        raise NotImplementedError()
+
+
+    def _fit_estimator(self, estimator, X, y, fit_params):
+        fitted_estimator = estimator.fit(X, y, **fit_params)
+        return fitted_estimator
+
+
+    def _predict_estimator(self, estimator, X):
+        return estimator.predict(X)
+
+
+    def _vote(self, predictions):
+        """
+        Voting mechanism (can be overridden by subclass if desired).
+
+        Args:
+            predictions (list(list(list(str)))): a list of list of list of BIO
+                tags predicted by each NER in the ensemble. Each NER outputs
+                a list of list of BIO tags where the outer list corresponds
+                to sentences and the inner list corresponds to tokens.
+
+        Returns:
+            voted_predictions (list(list(str))): a list of list of BIO tags.
+                Each BIO tag is the most frequent tag at that token position
+                across the weight-replicated estimator predictions.
+        """
+        tag2int, int2tag = self._build_label_vocab(predictions)
+
+        best_preds = []
+        for row_id in range(len(predictions[0])):
+
+            row_preds = []
+            # gather all predictions for this row
+            for est_id in range(len(predictions)):
+                sent_pred = np.array([tag2int[y] for y in predictions[est_id][row_id]])
+                # replicate each estimator's prediction by its weight
+                for _ in range(self.weights[est_id]):
+                    row_preds.append(sent_pred)
+
+            # convert to numpy matrix for performance
+            R = np.array(row_preds)
+
+            # we now find the most frequent tag at each position
+            B = np.zeros(R.shape[1], dtype="int32")
+            for col_id in range(R.shape[1]):
+                col = R[:, col_id]
+                values, indices = np.unique(col, return_inverse=True)
+                B[col_id] = values[np.argmax(np.bincount(indices))]
+
+            # append the labels associated with the most frequent tags
+            best_preds.append([int2tag[x] for x in B.tolist()])
+
+        return best_preds
+
+
+    def _build_label_vocab(self, predictions):
+        """ Build lookup tables from tag to int and back (for performance). """
+        tag2int, int2tag = {}, {}
+        tok_int = 1
+        for est_pred in predictions:
+            for sent_pred in est_pred:
+                for tok_pred in sent_pred:
+                    if tok_pred not in tag2int:
+                        tag2int[tok_pred] = tok_int
+                        tok_int += 1
+        int2tag = {v: k for k, v in tag2int.items()}
+        return tag2int, int2tag
diff --git a/nerds/test/test_ensemble_ner.py b/nerds/test/test_ensemble_ner.py
new file mode 100644
index 0000000..19eca34
--- /dev/null
+++ b/nerds/test/test_ensemble_ner.py
@@ -0,0 +1,18 @@
+from nose.tools import assert_equal, assert_true
+
+from nerds.models import DictionaryNER, CrfNER, SpacyNER, EnsembleNER
+from nerds.utils import load_data_and_labels
+
+def test_ensemble_ner():
+    X, y = load_data_and_labels("nerds/test/data/example.iob")
+    estimators = [
+        (DictionaryNER(), {}),
+        (CrfNER(), {"max_iterations": 1}),
+        (SpacyNER(), {"num_epochs": 1})
+    ]
+    model = EnsembleNER()
+    model.fit(X, y, estimators=estimators)
+    y_pred = model.predict(X)
+    assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.")
+    assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.")
+

From 08c3ef3283e0d5a94f0bf2efc3c5e81626207dc7 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Wed, 13 Nov 2019 15:58:51 -0800
Subject: [PATCH 11/64] replacing mention of IOB with BIO to
reduce confusion. --- nerds/models/spacy.py | 16 ++++++++-------- nerds/utils.py | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 0694a93..8cd8a39 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -33,7 +33,7 @@ def fit(self, X, y, Args: X (list(list(str))): list of tokenized sentences, or list of list of tokens. - y (list(list(str))): list of list of IOB tags. + y (list(list(str))): list of list of BIO tags. num_epochs (int): number of epochs of training. dropout (float): rate of dropout during training between 0 and 1. batch_size (int): batch size to use during training @@ -143,7 +143,7 @@ def _convert_to_spacy(self, tokens, labels): Args: tokens (list(str)): list of tokens. - labels (list(str)): list of IOB tags. + labels (list(str)): list of BIO tags. Returns: list of tuples in SpaCy format as shown below: @@ -217,22 +217,22 @@ def _convert_from_spacy(self, sent, entities): Entity(start_char, end_char, label_). Returns: - predictions (list(str)): a list of IOB tags for a single + predictions (list(str)): a list of BIO tags for a single sentence. """ - iob_tags, prev_tag = [], None + bio_tags, prev_tag = [], None entity_dict = self._entities2dict(entities) for offset_token in self._tokenize_with_offsets(sent): offset, token = offset_token if offset in entity_dict: curr_tag = entity_dict[offset] if prev_tag is None or prev_tag != curr_tag: - iob_tags.append("B-" + curr_tag) + bio_tags.append("B-" + curr_tag) else: - iob_tags.append("I-" + curr_tag) + bio_tags.append("I-" + curr_tag) else: curr_tag = "O" - iob_tags.append("O") + bio_tags.append("O") prev_tag = curr_tag - return iob_tags + return bio_tags diff --git a/nerds/utils.py b/nerds/utils.py index 652b9e7..0ccd658 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -20,7 +20,7 @@ def load_data_and_labels(filepath): to be familiar with Anago. Args: - filepath (str): path to the file in IOB format to be loaded. + filepath (str): path to the file in BIO format to be loaded. Returns: x (list(str)): list of tokens. @@ -38,10 +38,10 @@ def flatten_lol(xs, strip_prefix=True): xs (list(list(str))): list of list of tags (inner list is sentence). strip_prefix (bool): if True, remove leading I- and B-, else retain. """ - def strip_iob_prefix(label): + def strip_bio_prefix(label): return label.split('-')[-1] if strip_prefix: - return [strip_iob_prefix(x) for x in itertools.chain.from_iterable(xs)] + return [strip_bio_prefix(x) for x in itertools.chain.from_iterable(xs)] else: return [x for x in itertools.chain.from_iterable(xs)] From 0d352f80f43031f5a6530f5f4ed763b329a1ad3d Mon Sep 17 00:00:00 2001 From: sujitpal Date: Thu, 14 Nov 2019 10:10:37 -0800 Subject: [PATCH 12/64] Replaced deprecated references to sklearn.externals.joblib with joblib per deprecation warning, added load/save tests to CRF and Dictionary NER models. 
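
sklearn.externals.joblib is deprecated in recent scikit-learn releases, and
the standalone joblib package is a drop-in replacement for the dump/load
pattern used by save() and load(), sketched below:

```python
import joblib

joblib.dump(model, "crf-model.pkl")   # what save() does with the fitted model
model = joblib.load("crf-model.pkl")  # what load() does to restore it
```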
--- nerds/models/crf.py | 8 ++++---- nerds/models/dictionary.py | 3 +-- nerds/test/test_crf_ner.py | 9 +++++++-- nerds/test/test_dictionary_ner.py | 9 +++++++-- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/nerds/models/crf.py b/nerds/models/crf.py index 6803cc0..2550345 100644 --- a/nerds/models/crf.py +++ b/nerds/models/crf.py @@ -1,9 +1,8 @@ from nerds.models import NERModel from nerds.utils import get_logger -from sklearn.externals import joblib - import os +import joblib import sklearn_crfsuite import spacy @@ -100,8 +99,9 @@ def save(self, dirpath): if self.model is None: raise ValueError("No model to save, run fit() to train or load() pre-trained model") - if not os.path.exists(MODEL_DIR): - os.makedirs(MODEL_DIR) + if not os.path.exists(dirpath): + os.makedirs(dirpath) + model_file = os.path.join(dirpath, "crf-model.pkl") joblib.dump(self.model, model_file) diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py index eeac00a..2604781 100644 --- a/nerds/models/dictionary.py +++ b/nerds/models/dictionary.py @@ -1,9 +1,8 @@ from nerds.models import NERModel from nerds.utils import get_logger -from sklearn.externals import joblib - import ahocorasick +import joblib import os log = get_logger() diff --git a/nerds/test/test_crf_ner.py b/nerds/test/test_crf_ner.py index 73d239e..1ae1686 100644 --- a/nerds/test/test_crf_ner.py +++ b/nerds/test/test_crf_ner.py @@ -3,10 +3,15 @@ from nerds.models import CrfNER from nerds.utils import load_data_and_labels +import shutil + def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") model = CrfNER() model.fit(X, y) - y_pred = model.predict(X) + model.save("nerds/test/data/models") + r_model = model.load("nerds/test/data/models") + y_pred = r_model.predict(X) assert_equal(y, y_pred, "Label and prediction must be equal") - assert_equal(1.0, model.score(X, y)) + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") diff --git a/nerds/test/test_dictionary_ner.py b/nerds/test/test_dictionary_ner.py index d83e491..a24b238 100644 --- a/nerds/test/test_dictionary_ner.py +++ b/nerds/test/test_dictionary_ner.py @@ -3,13 +3,18 @@ from nerds.models import DictionaryNER from nerds.utils import load_data_and_labels +import shutil + def test_dictionary_ner_from_conll(): X, y = load_data_and_labels("nerds/test/data/example.iob") model = DictionaryNER() model.fit(X, y) - y_pred = model.predict(X) + model.save("nerds/test/data/models") + r_model = model.load("nerds/test/data/models") + y_pred = r_model.predict(X) assert_equal(y, y_pred, "Label and prediction must be equal") - assert_equal(1.0, model.score(X, y)) + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") def test_dictionary_ner_from_dict(): From f12f915a817a13074588cb76600ac743519dd7ae Mon Sep 17 00:00:00 2001 From: sujitpal Date: Thu, 14 Nov 2019 10:49:01 -0800 Subject: [PATCH 13/64] Adding load and save for all NER models to tests. 
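
Each model test now exercises the full save/load round trip before asserting
on predictions. The common pattern (a sketch; model construction and the
assertions vary per test):

```python
model.fit(X, y)
model.save("nerds/test/data/models")
model_r = model.load("nerds/test/data/models")
y_pred = model_r.predict(X)
# ... assertions on y_pred ...
shutil.rmtree("nerds/test/data/models")
```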
--- nerds/models/spacy.py | 2 +- nerds/test/test_bilstm_ner.py | 8 +++++++- nerds/test/test_elmo_ner.py | 6 +++++- nerds/test/test_spacy_ner.py | 9 +++++++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 8cd8a39..9db2131 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -134,7 +134,7 @@ def load(self, dirpath): raise ValueError("Model directory {:s} not found".format(dirpath)) log.info("Loading model...") - self.model = spacy.load(MODEL_DIR) + self.model = spacy.load(dirpath) return self diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py index 55f17c4..60c9466 100644 --- a/nerds/test/test_bilstm_ner.py +++ b/nerds/test/test_bilstm_ner.py @@ -3,11 +3,17 @@ from nerds.models import BiLstmCrfNER from nerds.utils import load_data_and_labels +import shutil + def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") model = BiLstmCrfNER() model.fit(X, y, num_epochs=1) - y_pred = model.predict(X) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) # there is not enough data to train this model properly, so decent # asserts are unlikely to succeed. assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") + shutil.rmtree("nerds/test/data/models") + diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py index a99eed4..8762406 100644 --- a/nerds/test/test_elmo_ner.py +++ b/nerds/test/test_elmo_ner.py @@ -4,6 +4,7 @@ from nerds.utils import load_data_and_labels import numpy as np +import shutil def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") @@ -11,7 +12,10 @@ def test_crf_ner(): # there are 28 unique words in our "vocabulary" embeddings = np.random.random((28, 100)) model.fit(X, y, embeddings=embeddings, num_epochs=1) - y_pred = model.predict(X) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) # there is not enough data to train this model properly, so decent # asserts are unlikely to succeed. 
assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") + shutil.rmtree("nerds/test/data/models") \ No newline at end of file diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py index 64bc76a..0525b44 100644 --- a/nerds/test/test_spacy_ner.py +++ b/nerds/test/test_spacy_ner.py @@ -3,10 +3,15 @@ from nerds.models import SpacyNER from nerds.utils import load_data_and_labels +import shutil + def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") model = SpacyNER() model.fit(X, y) - y_pred = model.predict(X) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) assert_equal(y, y_pred, "Label and prediction must be equal") - assert_equal(1.0, model.score(X, y)) + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") \ No newline at end of file From 95bd584dab4953f82932cc78a93332f0040f64a7 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Thu, 21 Nov 2019 11:41:10 -0800 Subject: [PATCH 14/64] flatten and unflatten lists with tests --- nerds/models/base.py | 4 ++-- nerds/test/test_utils.py | 7 ++++++- nerds/utils.py | 40 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/nerds/models/base.py b/nerds/models/base.py index fdd2454..afa3b79 100644 --- a/nerds/models/base.py +++ b/nerds/models/base.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score -from nerds.utils import flatten_lol +from nerds.utils import flatten_list class NERModel(BaseEstimator, ClassifierMixin): """ Provides a basic interface to train NER models and annotate documents. @@ -38,5 +38,5 @@ def score(self, X, y, sample_weights=None): method is needed for GridSearch like operations. """ y_pred = self.predict(X) - return accuracy_score(flatten_lol(y), flatten_lol(y_pred)) + return accuracy_score(flatten_list(y), flatten_list(y_pred)) diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index aa96016..a8e0992 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -12,6 +12,11 @@ def test_load_data_and_labels(): def test_flatten_lol(): X, y = load_data_and_labels("nerds/test/data/example.iob") - yflat = flatten_lol(y, strip_prefix=True) + yflat = flatten_list(y, strip_prefix=True, capture_lengths=True) assert_equal(36, len(yflat), "There should be 36 tags in all") assert_equal(5, len([y for y in yflat if y == "PER"]), "There should be 5 PER tags") + y_lengths = compute_list_lengths(y) + y_unflat = unflatten_list(yflat, y_lengths) + assert_equal(len(y), len(y_unflat), "Reconstructed y (y_unflat) should be identical to y") + assert_equal(len(y[0]), len(y_unflat[0]), "Reconstructed y (y_unflat) should be identical to y") + diff --git a/nerds/utils.py b/nerds/utils.py index 0ccd658..bd79188 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -29,7 +29,7 @@ def load_data_and_labels(filepath): return anago.utils.load_data_and_labels(filepath) -def flatten_lol(xs, strip_prefix=True): +def flatten_list(xs, strip_prefix=True, capture_lengths=False): """ Flatten label or predictions from list(list(str)) to list(str). Flattened list can be input to scikit-learn's standard functions to compute various metrics. @@ -37,12 +37,50 @@ def flatten_lol(xs, strip_prefix=True): Args: xs (list(list(str))): list of list of tags (inner list is sentence). strip_prefix (bool): if True, remove leading I- and B-, else retain. 
+ + Returns: + xs_flat list(str): the flattened list. + xs_lengths list(int) or None: a list of lengths of the inner list(str) + of the input xs. """ def strip_bio_prefix(label): return label.split('-')[-1] + if strip_prefix: return [strip_bio_prefix(x) for x in itertools.chain.from_iterable(xs)] else: return [x for x in itertools.chain.from_iterable(xs)] +def compute_list_lengths(xs): + """ Convenience method to return a list of ints representing lengths of + inner lists in xs. + + Args: + xs (list(list(str))): list of list of tags. + + Returns: + xs_lengths (list(int)): list of lengths of inner list. + """ + return [len(x) for x in xs] + + +def unflatten_list(xs_flat, xs_lengths): + """ Reverse operation of flatten_list. Using the flattened list and the list + of list lengths of the inner list, reconstructs original list(list(str)). + + Args: + xs_flat list(str): the flattened list. + xs_lengths list(int): list of inner list to group by. + + Returns: + xs_unflat list(list(str)): original list of list(list(str)) + """ + xs_unflat = [] + start = 0 + for l in xs_lengths: + end = start + l + xs_unflat.append(xs_flat[start:end]) + start = end + return xs_unflat + From 956c13a1198e80b70e5402f5b19321786209f1d4 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 07:37:10 -0800 Subject: [PATCH 15/64] GMB example rewrite with updated API --- examples/GMB/README.md | 193 +++++++++++++++----------- examples/GMB/download_glove.sh | 4 + examples/GMB/read_data.py | 64 --------- examples/GMB/test_models.py | 240 ++++++++++++++++++--------------- nerds/models/dictionary.py | 3 +- nerds/models/elmo.py | 7 +- nerds/test/test_utils.py | 2 +- 7 files changed, 255 insertions(+), 258 deletions(-) create mode 100644 examples/GMB/download_glove.sh delete mode 100644 examples/GMB/read_data.py diff --git a/examples/GMB/README.md b/examples/GMB/README.md index af94284..ceed638 100644 --- a/examples/GMB/README.md +++ b/examples/GMB/README.md @@ -1,109 +1,144 @@ # Dataset description -Annotated Corpus for Named Entity Recognition using GMB (Groningen Meaning Bank) corpus for entity classification with enhanced and popular features by Natural Language Processing applied to the data set. - -# Source - -Downloaded from [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) - -# Overall number of entities - -```python -{'O': 1146068', - 'geo-nam': 58388, - 'org-nam': 48034, - 'per-nam': 23790, - 'gpe-nam': 20680, - 'tim-dat': 12786, - 'tim-dow': 11404, - 'per-tit': 9800, - 'per-fam': 8152, - 'tim-yoc': 5290, - 'tim-moy': 4262, - 'per-giv': 2413, - 'tim-clo': 891, - 'art-nam': 866, - 'eve-nam': 602, - 'nat-nam': 300, - 'tim-nam': 146, - 'eve-ord': 107, - 'per-ini': 60, - 'org-leg': 60, - 'per-ord': 38, - 'tim-dom': 10, - 'per-mid': 1, - 'art-add': 1} +Annotated Corpus for Named Entity Recognition using GMB (Groningen Meaning Bank) corpus for entity classification with enhanced and popular features by Natural Language Processing applied to the data set. Downloaded from [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) to `train.csv` file locally. + +In addition, [GloVe (Global Vectors for Word Representation) vectors](https://nlp.stanford.edu/projects/glove/) are needed to run the ElmoNER model, please download them by running the provided `download_glove.sh` script. 
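+
+The `test_models.py` driver first converts `train.csv` into the two-column IOB format used throughout NERDS, then loads it back as lists of tokens and tags, roughly as follows (the `convert_to_iob_format` helper is defined in `test_models.py`):
+
+```python
+from nerds.utils import load_data_and_labels
+
+convert_to_iob_format("train.csv", "train.iob")
+data, labels = load_data_and_labels("train.iob")
+```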
+ +## Overall number of entities + +``` + 699 art + 561 eve + 45058 geo + 16068 gpe + 252 nat + 36927 org + 34241 per + 26861 tim ``` ## Training -To keep the training time reasonable, we shuffle the sentences from the dataset (with a fixed seed) and train with only the first 5000 instances. +We train with the full set of data, and the entire run across all the provided models can be fairly time consuming. If it is desired to keep the training time reasonable, you can train only with 5000 sentences by uncommenting lines 47-49 in `test_models.py`. ## Results -### CRF (max_iterations=100, c1=0.1, c2=0.1) +### Dictionary NER + +``` + precision recall f1-score support + + art 0.01 0.15 0.02 215 + eve 0.22 0.43 0.29 169 + geo 0.35 0.74 0.48 13724 + gpe 0.93 0.90 0.92 4850 + nat 0.27 0.53 0.36 94 + org 0.41 0.67 0.51 10884 + per 0.77 0.74 0.75 10342 + tim 0.14 0.92 0.25 8140 + + micro avg 0.32 0.77 0.45 48418 + macro avg 0.39 0.64 0.45 48418 +weighted avg 0.48 0.77 0.55 48418 +``` + +### CRF NER (max_iterations=100, c1=0.1, c2=0.1) ``` -Label: art 0.1875 0.0410958904109589 0.06741573033707865 -Label: org 0.696551724137931 0.6350665054413543 0.6643896268184694 -Label: geo 0.763373190685966 0.7818744359932964 0.7725130556616991 -Label: nat 0.23076923076923078 0.07317073170731707 0.1111111111111111 -Label: gpe 0.9410415984277759 0.9048818897637795 0.9226075786769428 -Label: per 0.7281134401972873 0.7013064133016627 0.7144585601935873 -Label: eve 0.5348837209302325 0.3709677419354839 0.4380952380952381 -Label: tim 0.887613454351308 0.8149509803921569 0.849731663685152 + precision recall f1-score support + + art 0.28 0.05 0.08 215 + eve 0.54 0.33 0.41 169 + geo 0.87 0.89 0.88 13724 + gpe 0.95 0.92 0.94 4850 + nat 0.71 0.32 0.44 94 + org 0.80 0.78 0.79 10884 + per 0.88 0.88 0.88 10342 + tim 0.93 0.87 0.90 8140 + + micro avg 0.87 0.85 0.86 48418 + macro avg 0.74 0.63 0.66 48418 +weighted avg 0.87 0.85 0.86 48418 + ``` The entity types which have enough examples have good results! 
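+
+All reports in this section are scikit-learn `classification_report` outputs computed on flattened tags with the B-/I- prefixes stripped, for example:
+
+```python
+from sklearn.metrics import classification_report
+from nerds.utils import flatten_list
+
+print(classification_report(flatten_list(ytest, strip_prefix=True),
+                            flatten_list(ypred, strip_prefix=True),
+                            labels=entity_labels))
+```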
-## Spacy (num_epochs=20, dropout=0.1) +## SpaCy NER (num_epochs=20, dropout=0.1) ``` -Label: art 0.09090909090909091 0.0410958904109589 0.056603773584905655 -Label: org 0.6371129427489418 0.6756437514764942 0.6558128869525338 -Label: geo 0.8506005015177511 0.8454676636494818 0.8480263157894736 -Label: nat 0.3333333333333333 0.075 0.12244897959183673 -Label: gpe 0.9200373366521468 0.9295818924866395 0.9247849882720875 -Label: per 0.7063857801184991 0.6349112426035503 0.6687441570582736 -Label: eve 0.625 0.24193548387096775 0.3488372093023256 -Label: tim 0.8101326899879373 0.8220318237454101 0.8160388821385176 + precision recall f1-score support + + art 0.26 0.07 0.10 215 + eve 0.61 0.24 0.34 169 + geo 0.87 0.87 0.87 13724 + gpe 0.94 0.93 0.93 4850 + nat 0.87 0.28 0.42 94 + org 0.79 0.77 0.78 10884 + per 0.85 0.90 0.88 10342 + tim 0.96 0.83 0.89 8140 + + micro avg 0.87 0.85 0.86 48418 + macro avg 0.77 0.61 0.65 48418 +weighted avg 0.87 0.85 0.86 48418 + ``` -## BiLSTM-CRF (char_emb_size=32, word_emb_size=128, char_lstm_units=32, word_lstm_units=128, dropout=0.1, batch_size=16, learning_rate=0.001, num_epochs=10) +## BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) ``` -Label: art 0.05714285714285714 0.0273972602739726 0.037037037037037035 -Label: org 0.29821791112113694 0.31824747231584016 0.3079073017351811 -Label: geo 0.6431408898305084 0.6262248581743166 0.634570159393781 -Label: nat 0.0 0.0 0.0 -Label: gpe 0.890119760479042 0.9167437557816837 0.9032356068661704 -Label: per 0.1862842070557204 0.15786532550991897 0.17090139140955837 -Label: eve 0.3870967741935484 0.1935483870967742 0.25806451612903225 -Label: tim 0.7139974779319042 0.671967718965108 0.6923453167033504 + precision recall f1-score support + + art 0.25 0.09 0.14 215 + eve 0.37 0.29 0.33 169 + geo 0.84 0.89 0.87 13724 + gpe 0.95 0.93 0.94 4850 + nat 0.71 0.31 0.43 94 + org 0.84 0.72 0.77 10884 + per 0.87 0.90 0.89 10342 + tim 0.89 0.89 0.89 8140 + + micro avg 0.86 0.85 0.86 48418 + macro avg 0.72 0.63 0.66 48418 +weighted avg 0.86 0.85 0.85 48418 + ``` -## Pooling ensemble +## ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) ``` -Label: art 0.0759493670886076 0.0821917808219178 0.07894736842105263 -Label: org 0.398450340455506 0.7880195031344324 0.5292787524366471 -Label: geo 0.6124665775401069 0.9238719435341568 0.7366093859913576 -Label: nat 0.19230769230769232 0.125 0.15151515151515152 -Label: gpe 0.830937167199148 0.9550183598531212 0.8886674259681092 -Label: per 0.43697978596908443 0.813503043718871 0.5685554051440727 -Label: eve 0.37142857142857144 0.41935483870967744 0.393939393939394 -Label: tim 0.6545580349420516 0.888262910798122 0.7537097898615676 + precision recall f1-score support + + art 0.13 0.15 0.14 215 + eve 0.35 0.46 0.40 169 + geo 0.88 0.89 0.88 13724 + gpe 0.94 0.94 0.94 4850 + nat 0.71 0.21 0.33 94 + org 0.82 0.76 0.79 10884 + per 0.86 0.93 0.89 10342 + tim 0.91 0.88 0.90 8140 + + micro avg 0.87 0.87 0.87 48418 + macro avg 0.70 0.65 0.66 48418 +weighted avg 0.87 0.87 0.86 48418 + ``` -## Majority voting ensemble +## Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) ``` -Label: art 0.4 0.0273972602739726 
0.05128205128205128 -Label: org 0.7885109114249037 0.5937651039149348 0.6774193548387097 -Label: geo 0.882788868723533 0.7560880829015544 0.814540887524421 -Label: nat 1.0 0.024390243902439025 0.047619047619047616 -Label: gpe 0.9580103359173127 0.9286161552911709 0.9430842607313196 -Label: per 0.8227743271221533 0.5885663507109005 0.686237264721119 -Label: eve 0.8421052631578947 0.25806451612903225 0.3950617283950617 -Label: tim 0.9210599721059972 0.80615234375 0.8597838823069914 + precision recall f1-score support + + art 0.17 0.08 0.11 215 + eve 0.47 0.22 0.30 169 + geo 0.83 0.87 0.85 13724 + gpe 0.98 0.89 0.93 4850 + nat 0.76 0.31 0.44 94 + org 0.84 0.64 0.73 10884 + per 0.93 0.71 0.81 10342 + tim 0.90 0.86 0.88 8140 + + micro avg 0.87 0.78 0.82 48418 + macro avg 0.73 0.57 0.63 48418 +weighted avg 0.87 0.78 0.82 48418 ``` diff --git a/examples/GMB/download_glove.sh b/examples/GMB/download_glove.sh new file mode 100644 index 0000000..38a5a99 --- /dev/null +++ b/examples/GMB/download_glove.sh @@ -0,0 +1,4 @@ +#!/bin/bash +wget http://nlp.stanford.edu/data/glove.6B.zip +unzip -a glove.6B.zip +rm glove.6B.200d.txt glove.6B.300d.txt glove.6B.50d.txt diff --git a/examples/GMB/read_data.py b/examples/GMB/read_data.py deleted file mode 100644 index fe3f119..0000000 --- a/examples/GMB/read_data.py +++ /dev/null @@ -1,64 +0,0 @@ -import csv - -from nerds.core.model.input.document import Document -from nerds.util.convert import transform_bio_tags_to_annotated_document - - -PATH_TO_FILE = "train.csv" - - -def read_kaggle_data(): - sentences = [] - pos = [] - tag = [] - - tmp_sentence = [] - tmp_pos = [] - tmp_tag = [] - - with open(PATH_TO_FILE, "rt") as csvfile: - csv_reader = csv.reader(csvfile, delimiter=',', quotechar='"') - # Ignore the header - next(csv_reader) - - for row in csv_reader: - - if row[0].startswith("Sentence: "): - if len(tmp_sentence) != 0: - sentences.append(tmp_sentence) - pos.append(tmp_pos) - tag.append(tmp_tag) - - tmp_sentence = [] - tmp_pos = [] - tmp_tag = [] - - tmp_sentence.append(row[1]) - tmp_pos.append(row[2]) - tmp_tag.append(row[3].replace("-", "_")) - - if len(tmp_sentence) != 0: - sentences.append(tmp_sentence) - pos.append(tmp_pos) - - return sentences, pos, tag - - -def data_to_annotated_docs(): - sentences, pos, tags = read_kaggle_data() - - documents = [Document(u" ".join(sentence).encode("utf-8")) - for sentence in sentences] - - ann_docs = [] - for i in range(len(documents)): - try: - sentence = sentences[i] - tag = tags[i] - document = documents[i] - ann_docs.append(transform_bio_tags_to_annotated_document(sentence, - tag, - document)) - except IndexError: - continue - return ann_docs diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py index 475326b..96fbdcc 100644 --- a/examples/GMB/test_models.py +++ b/examples/GMB/test_models.py @@ -1,111 +1,133 @@ -import random +import csv +import os import shutil -from nerds.core.model.ner.crf import CRF -from nerds.core.model.ner.spacy import SpaCyStatisticalNER -from nerds.core.model.ner.bilstm import BidirectionalLSTM -from nerds.core.model.ner.ensemble import ( - NERModelEnsembleMajorityVote, NERModelEnsemblePooling) -from nerds.core.model.eval.score import calculate_precision_recall_f1score - -from read_data import data_to_annotated_docs - -X = data_to_annotated_docs() -print("Loaded data: ", len(X), "data points") -random.Random(42).shuffle(X) - -entity_names = ['art', 'org', 'geo', 'nat', 'gpe', 'per', 'eve', 'tim'] -print("All labels: ", entity_names) - -train_test_split = 0.8 -train_X = 
X[:int(0.8 * len(X))] -test_X = X[int(0.8 * len(X)):] -print("Training: ", len(train_X)) -print("Training: ", len(test_X)) - - -def test_CRF(): - crf_model = CRF() - crf_model.fit(train_X[:5000]) - - X_pred = crf_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - crf_model.save("tmp") - - -def test_spacy(): - spacy_model = SpaCyStatisticalNER() - # Using the entire dataset will make Spacy die! - spacy_model.fit(train_X[:5000]) - - X_pred = spacy_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - spacy_model.save("tmp") - - -def test_LSTM(): - lstm_model = BidirectionalLSTM() - lstm_model.fit(train_X[:5000]) - - X_pred = lstm_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - lstm_model.save("tmp") - - -def test_ensembles(): - lstm_model = BidirectionalLSTM() - lstm_model.load("tmp") - spacy_model = SpaCyStatisticalNER() - spacy_model.load("tmp") - crf_model = CRF() - crf_model.load("tmp") - - models = [lstm_model, crf_model, spacy_model] - ens1 = NERModelEnsembleMajorityVote(models) - ens2 = NERModelEnsemblePooling(models) - - X_pred_1 = ens1.transform(test_X) - print("Majority Vote: \n") - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred_1, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - X_pred_2 = ens2.transform(test_X) - print("Pooling: \n") - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred_2, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - -test_LSTM() -test_CRF() -test_spacy() -test_ensembles() - -# Clean-up the model dirs. 
-shutil.rmtree("tmp/") +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.utils import shuffle + +from nerds.models import ( + DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, ElmoNER, EnsembleNER +) +from nerds.utils import * + +def convert_to_iob_format(input_file, output_file): + num_written = 0 + fout = open(output_file, "w") + with open(input_file, "r", encoding="iso-8859-1") as fin: + csv_reader = csv.reader(fin, delimiter=',', quotechar='"') + # skip header + next(csv_reader) + for line in csv_reader: + sid, token, pos, tag = line + if num_written > 0: + if len(sid) != 0: + # end of sentence marker + fout.write("\n") + fout.write("\t".join([token, tag]) + "\n") + num_written += 1 + + fout.write("\n") + fout.close() + +# convert Kaggle dataset to our standard IOB format +if not os.path.exists("train.iob"): + convert_to_iob_format("train.csv", "train.iob") + +# these are our entities +entity_labels = ["art", "eve", "geo", "gpe", "nat", "org", "per", "tim"] + +# make model directory to store our models +if not os.path.exists("models"): + os.makedirs("models") + +# read IOB file +data, labels = load_data_and_labels("train.iob") +# optional: restrict dataset to 5000 sentences +# data_s, labels_s = shuffle(data, labels, random_state=42) +# data = data_s +# labels = labels_s +print(len(data), len(labels)) + +# split into train and test set +xtrain, xtest, ytrain, ytest = train_test_split(data, labels, + test_size=0.3, random_state=42) +print(len(xtrain), len(ytrain), len(xtest), len(ytest)) + +# train and test the dictionary NER +model = DictionaryNER() +model.fit(xtrain, ytrain) +model.save("models/dict_model") +trained_model = model.load("models/dict_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the CRF NER +model = CrfNER() +model.fit(xtrain, ytrain) +model.save("models/crf_model") +trained_model = model.load("models/crf_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the SpaCy NER +model = SpacyNER() +model.fit(xtrain, ytrain) +model.save("models/spacy_model") +trained_model = model.load("models/spacy_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the BiLSTM-CRF NER +model = BiLstmCrfNER() +model.fit(xtrain, ytrain) +model.save("models/bilstm_model") +trained_model = model.load("models/bilstm_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the ELMo NER +if os.path.exists("glove.6B.100d.txt"): + model = ElmoNER() + model.fit(xtrain, ytrain) + model.save("models/elmo_model") + trained_model = model.load("models/elmo_model") + ypred = trained_model.predict(xtest) + print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# create and test an ensemble +dict_model = DictionaryNER() +dict_model.load("models/dict_model") +crf_model = CrfNER() +crf_model.load("models/crf_model") +spacy_model = SpacyNER() 
+spacy_model.load("models/spacy_model")
+bilstm_model = BiLstmCrfNER()
+bilstm_model.load("models/bilstm_model")
+model = EnsembleNER()
+model.fit(xtrain, ytrain,
+    estimators=[
+        (dict_model, {}),
+        (crf_model, {}),
+        (spacy_model, {}),
+        (bilstm_model, {})
+    ],
+    is_pretrained=True)
+ypred = model.predict(xtest)
+print(classification_report(flatten_list(ytest, strip_prefix=True),
+    flatten_list(ypred, strip_prefix=True),
+    labels=entity_labels))
+
+# clean up
+shutil.rmtree("models")
+os.remove("train.iob")
\ No newline at end of file
diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py
index 2604781..d0de654 100644
--- a/nerds/models/dictionary.py
+++ b/nerds/models/dictionary.py
@@ -150,8 +150,9 @@ def _add_if_not_spurious_match(self, start_index, end_index, tag,
             matched_phrases (list(str)): list of matched phrases, updated
                 in place by function.
         """
+        # print(start_index, end_index, tag, sentence, len(sentence))
         if start_index == 0:
-            if end_index < len(sentence):
+            if end_index + 1 < len(sentence):
                 if sentence[end_index + 1] == " ":
                     matched_phrases.append((start_index, end_index + 1, tag))
             elif end_index + 1 == len(sentence):
diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py
index 22deeef..33d6ca6 100644
--- a/nerds/models/elmo.py
+++ b/nerds/models/elmo.py
@@ -41,11 +41,10 @@ def fit(self, X, y,
             fc_dim=100,
             dropout=0.5,
             embeddings=None,
-            embeddings_file=None,
-            embeddings_dim=None,
+            embeddings_file="glove.6B.100d.txt",
             batch_size=16,
             learning_rate=0.001,
-            num_epochs=10):
+            num_epochs=2):
         """ Trains the NER model. Input is list of AnnotatedDocuments.
 
             Args:
@@ -59,7 +58,6 @@ def fit(self, X, y,
             dropout (float): dropout rate.
             embeddings (numpy array): word embedding matrix.
             embeddings_file (str): path to embedding file.
-            embeddings_dim (int): size of embedding vector.
             use_char (boolean): add char feature.
             use_crf (boolean): use crf as last layer.
             batch_size training batch size.
@@ -75,6 +73,7 @@ def fit(self, X, y,
 
         if embeddings is None:
             embeddings = load_glove(embeddings_file)
+            embeddings_dim = embeddings[list(embeddings.keys())[0]].shape[0]
             embeddings = filter_embeddings(embeddings,
                 self.preprocessor._word_vocab.vocab,
                 embeddings_dim)
diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py
index a8e0992..ac42f00 100644
--- a/nerds/test/test_utils.py
+++ b/nerds/test/test_utils.py
@@ -10,7 +10,7 @@ def test_load_data_and_labels():
     assert_equal(len(X[0]), len(y[0]),
         "Number of tokens should be equal to number of tags")
 
-def test_flatten_lol():
+def test_flatten_and_unflatten_list():
     X, y = load_data_and_labels("nerds/test/data/example.iob")
     yflat = flatten_list(y, strip_prefix=True, capture_lengths=True)
     assert_equal(36, len(yflat), "There should be 36 tags in all")
From 692c6b9e30f37b585d03cf933a0e823b1e38fb4c Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Fri, 22 Nov 2019 07:44:40 -0800
Subject: [PATCH 16/64] fix section levels

---
 examples/GMB/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/GMB/README.md b/examples/GMB/README.md
index ceed638..27c281d 100644
--- a/examples/GMB/README.md
+++ b/examples/GMB/README.md
@@ -64,7 +64,7 @@ weighted avg       0.87      0.85      0.86     48418
 
 The entity types which have enough examples have good results!
 
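A note on the `embeddings_dim` assignment in the `elmo.py` hunk of the previous patch: since `embeddings_dim` is no longer a `fit()` parameter, it has to be derived from the loaded vectors. A minimal standalone sketch of that derivation, assuming `embeddings` is a plain `{word: numpy.ndarray}` dict (the shape that `load_glove` appears to return):

```python
import numpy as np

def infer_embedding_dim(embeddings):
    # every value in the word -> vector dict shares the same dimension,
    # so inspecting any single entry is enough
    first_word = next(iter(embeddings))
    return embeddings[first_word].shape[0]

# toy check with a fake embedding table (illustrative values only)
toy = {"cat": np.zeros(100), "dog": np.zeros(100)}
assert infer_embedding_dim(toy) == 100
```
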
-## SpaCy NER (num_epochs=20, dropout=0.1) +### SpaCy NER (num_epochs=20, dropout=0.1) ``` precision recall f1-score support @@ -84,7 +84,7 @@ weighted avg 0.87 0.85 0.86 48418 ``` -## BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) ``` precision recall f1-score support @@ -104,7 +104,7 @@ weighted avg 0.86 0.85 0.85 48418 ``` -## ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) ``` precision recall f1-score support @@ -124,7 +124,7 @@ weighted avg 0.87 0.87 0.86 48418 ``` -## Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) +### Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) ``` precision recall f1-score support From 3025e858131f5a798cf2f34dbabd2edbd71d3233 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 07:49:50 -0800 Subject: [PATCH 17/64] also remove zip file --- examples/GMB/download_glove.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 examples/GMB/download_glove.sh diff --git a/examples/GMB/download_glove.sh b/examples/GMB/download_glove.sh old mode 100644 new mode 100755 index 38a5a99..4ecf263 --- a/examples/GMB/download_glove.sh +++ b/examples/GMB/download_glove.sh @@ -1,4 +1,4 @@ #!/bin/bash wget http://nlp.stanford.edu/data/glove.6B.zip unzip -a glove.6B.zip -rm glove.6B.200d.txt glove.6B.300d.txt glove.6B.50d.txt +rm glove.6B.200d.txt glove.6B.300d.txt glove.6B.50d.txt glove.6B.zip From 1885d92d26dffde139ccf0518069cdffb8d6b64e Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 10:21:33 -0800 Subject: [PATCH 18/64] moving artifacts to dedicated docs directory --- README.md | 6 +++++- CONTRIBUTING.md => docs/CONTRIBUTING.md | 0 nerds.png => docs/nerds.png | Bin 3 files changed, 5 insertions(+), 1 deletion(-) rename CONTRIBUTING.md => docs/CONTRIBUTING.md (100%) rename nerds.png => docs/nerds.png (100%) diff --git a/README.md b/README.md index 314954e..2300d81 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # nerds -![nerds logo](nerds.png) +![nerds logo](docs/nerds.png) # How to set up a DEV environment @@ -113,3 +113,7 @@ New models and input adapters are always welcome. Please make sure your code is * `make test` shows that all the unit test pass. * `make lint` shows no Python code violations. + +The [CONTRIBUTING.md file](docs/CONTRIBUTING.md) lists contributors who have contributed to the [NERDS (elsevierlabs-os/nerds)](https://github.com/elsevierlabs-os/nerds) project. 
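Looking back at the GMB result sections above: the hyperparameters quoted in each heading mirror keyword arguments of that model's `fit()` call (the ELMo variant of this signature is visible in the `elmo.py` diff earlier). A hedged sketch of invoking the ELMo configuration from its heading; the keyword names are assumed to be accepted by `fit()` as listed:

```python
from nerds.models import ElmoNER
from nerds.utils import load_data_and_labels

xtrain, ytrain = load_data_and_labels("train.iob")
model = ElmoNER()
# parameter names copied from the section heading; assumed fit() kwargs
model.fit(xtrain, ytrain,
          word_embedding_dim=100, char_embedding_dim=25,
          word_lstm_size=100, char_lstm_size=25, fc_dim=100,
          dropout=0.5, embeddings_file="glove.6B.100d.txt",
          batch_size=16, learning_rate=0.001, num_epochs=2)
```
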
+ + diff --git a/CONTRIBUTING.md b/docs/CONTRIBUTING.md similarity index 100% rename from CONTRIBUTING.md rename to docs/CONTRIBUTING.md diff --git a/nerds.png b/docs/nerds.png similarity index 100% rename from nerds.png rename to docs/nerds.png From 67171aedbb2486bf696a8295c5688e83ce7c17cd Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 10:49:00 -0800 Subject: [PATCH 19/64] remove version capping for SpaCy --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 045b3bf..2a6c2da 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ 'scipy', 'sklearn', 'sklearn-crfsuite', - 'spacy==2.0.11', + 'spacy', 'tensorflow', 'torch' ], From 172ce09f28383232da35a2ee1b93c2436b3388b8 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 10:49:51 -0800 Subject: [PATCH 20/64] remove glove embeddings file during cleanup --- examples/GMB/test_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py index 96fbdcc..1429aea 100644 --- a/examples/GMB/test_models.py +++ b/examples/GMB/test_models.py @@ -130,4 +130,5 @@ def convert_to_iob_format(input_file, output_file): # clean up shutil.rmtree("models") -os.remove("train.iob") \ No newline at end of file +os.remove("train.iob") +os.remove("glove.6B.100d.txt") \ No newline at end of file From 39d2c9694694bff997c9daccaf4bde25ec1e1837 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 11:03:38 -0800 Subject: [PATCH 21/64] updating README to describe forked code --- README.md | 44 +++++++++++++++++++++++++++++--------------- docs/CHANGES.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) create mode 100644 docs/CHANGES.md diff --git a/README.md b/README.md index 2300d81..f3a9b58 100644 --- a/README.md +++ b/README.md @@ -75,37 +75,48 @@ NERDS is a framework that provides some NER capabilities - among which the optio ## Understanding the main data exchange classes -There are 3 main classes in the `nerds.core.model.input.*` package that are used in our NER models: `Document`, `Annotation` and `AnnotatedDocument`. +The NERDS master project on [elsevierlabs-os/nerds](https://github.com/elsevierlabs-os/nerds) project uses a set of custom data exchange classes `Document`, `Annotation`, and `AnnotatedDocument`. The project provided a set of conversion utilities which could be used to convert provided datasets to this format, and convert instances of these classes back to whatever format the underlying wrapped NER model needed. However, this NERDS fork on [sujitpal/nerds](https://github.com/sujitpal/nerds) eliminates this requirement -- the internal format is just a list of list of tokens (words in sentence) or BIO tags. The utility function `nerds.utils.load_data_and_labels` can read a file in ConLL BIO format and convert to this internal format. This decision was made because 3 of the 5 provided models consume the list of list format natively, and the result is fewer lines of extra code and less potential for error. -A `Document` class is the abstract representation of a raw document. It should always implement the `plain_text_` attribute, that returns the plain text representation of the object, as it's the one where we are going to perform NER. Therefore, whenever we want to process any new type of document format - XML, PDF, JSON, brat, etc. 
- the only requirement is to write an adapter that reads the file(s) from an input directory and transforms them to `Document` objects. The default `Document` object works seamlessly with `.txt` files.
+In general, when given an input format that is not in ConLL BIO format, the main effort in using NERDS would be to convert it to ConLL BIO format. Once that is done, it is relatively easy to ingest it into a data and label structure, as shown below.
 
-The `Annotation` class contains the data for a single annotation. This is the text (e.g. "fox"), the label (e.g. "ANIMAL") and the offsets that correspond to offsets in the `plain_text_` representation of a `Document` (e.g. 40-42).
+```python
+from nerds.utils import load_data_and_labels
 
-> **Important to note**: The offsets is a 2-tuple of integers that represent the position of the first and the last character of the annotation. Be careful, because some libraries end the offset one character **after** the final character i.e. at `start_offset + len(word)`. This is not the case with us, we currently end the offsets at **exactly** the final character i.e. at `start_offset + len(word) - 1`.
-
-Finally, the `AnnotatedDocument` class is a combination of `Document` and a list of `Annotation`, and it can represent two things:
+data, labels = load_data_and_labels("nerds/test/data/example.iob")
+print("data:", data)
+print("labels:", labels)
+```
 
-* Ground truth data (e.g. brat annotation files).
-* Predictions on documents after they run through our NER models.
+yields the following output.
 
-The `AnnotatedDocument` class exposes the `annotated_text_` attribute which returns the plain text representation of the document with inline annotations.
+```
+data: [
+    ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.'],
+    ['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.', ',', 'the', 'Dutch', 'publishing', 'group', '.']
+]
+labels: [
+    ['B-PER', 'I-PER', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O'],
+    ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']
+]
+```
 
 ## Extending the base model class
 
-The basic class that every model needs to extend is the `NERModel` class in the `nerds.core.model.ner.base` package. The model class implements a `fit - transform` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum:
+The basic class that every model needs to extend is the `NERModel` class in the `nerds.models` package. The model class implements a `fit - transform` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum:
 
-* `fit`: Trains a model given a list of `AnnotatedDocument` objects.
-* `transform`: Gets a list of `Document` objects and transforms them to `AnnotatedDocument`.
+* `fit`: Trains a model given a list of list of tokens and BIO tags.
+* `predict`: Returns a list of list of BIO tags, given a list of list of tokens.
 * `save`: Disk persistence of a model.
 * `load`: Disk persistence of a model.
 
-Please note that **all** of the class methods, utility functions, etc. should operate on `Document` and `AnnotatedDocument` objects, to maintain compatibility with the rest of the framework. The only exception is "private" methods used internally in classes.
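To make the contract concrete, a minimal sketch of a conforming subclass follows; the class name and the trivial method bodies are illustrative placeholders, not part of the library:

```python
from nerds.models import NERModel

class CustomNER(NERModel):
    """A do-nothing NER model illustrating the required API."""

    def fit(self, X, y):
        # X: list of list of tokens, y: list of list of BIO tags
        return self

    def predict(self, X):
        # one list of BIO tags per input sentence
        return [["O"] * len(tokens) for tokens in X]

    def save(self, dirpath):
        pass  # persist any model artifacts under dirpath

    def load(self, dirpath):
        return self  # restore the artifacts written by save()
```
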
+As a best practice, I like to implement a single NER model as a single file in the `models` folder, but have it be accessible from client code directly as `nerds.models.CustomNER`. You can set this redirection up in `nerds/models/__init__.py`. # Running experiments -So, let's assume you have a dataset that contains annotated text. If it's in a format that is already supported (e.g. [brat](http://brat.nlplab.org/standoff.html)), then you may just load it into `AnnotatedDocument` objects using the built-in classes. Otherwise, you will have to extend the `nerds.core.model.input.DataInput` class to support the format. Then, you may use the built-in NER models (or create your own) either alone, or in an ensemble and evaluate their predictive capabilities on your dataset. +There are two examples of running experiments using NERDS. We will continue to update these examples as more functionality becomes available. -In the `nerds.core.model.evaluate` package, there are helper methods and classes to perform k-fold cross-validation. Please, refer to the `nerds.examples` package where you may look at working code examples with real datasets. +* [examples/GMB](examples/GMB) +* [examples/BioNLP](examples/BioNLP) # Contributing to the project @@ -116,4 +127,7 @@ New models and input adapters are always welcome. Please make sure your code is The [CONTRIBUTING.md file](docs/CONTRIBUTING.md) lists contributors who have contributed to the [NERDS (elsevierlabs-os/nerds)](https://github.com/elsevierlabs-os/nerds) project. +# Changes / Improvements in this Fork + +The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that were made in this fork. diff --git a/docs/CHANGES.md b/docs/CHANGES.md new file mode 100644 index 0000000..c1cdbff --- /dev/null +++ b/docs/CHANGES.md @@ -0,0 +1,43 @@ +# Improvements and Changes + +## Completed + +* Replace AnnotatedDocument common data format to List of List format borrowed from Anago. +* Model + * NERModel -- base class extends ClassifierMixin, so exposes predict() instead of transform(). + * DictionaryNER + * similar to ExactMatchDictionaryNER except + * takes Anago style IO + * handles multiple classes (as well as single class as special case) + * can handle Anago style input via fit(X, y, combine_tokens=True) and dictionary style input via fit(X, y, combine_tokens=False). + * CrfNER + * similar to CRF except + * takes Anago style IO (native IO format to wrapped model sklearn_crfsuite.CRF) + * replaces dependency on nltk.tokenize_pos() to SpaCy + * allows features to be directly passed to fit() using is_featurized=False. + * SpacyNER + * similar to SpacyStatisticalNER, except + * takes Anago style IO + * more robust to large data sizes, uses mini-batches for training + * BiLstmCrfNER + * similar to BidirectionalLSTM except + * takes Anago style IO + * works against most recent Anago API changes + * does not give timestep size errors + * ElmoNER + * New, available in Anago, same API as Anago's BiLSTMCRF + * EnsembleNER + * simpler interface + * weights from each classifier + * joblib.Parallel -- improve? +* Utils + * Thin wrapper over anago's `load_data_and_labels` + * Converter for output so scikit-learn metrics can be used. + +* Other stuff + * remove deprecated sklearn.external.joblib -> joblib + +## Planned + +* Scikit-Learn interoperability. 
+* BERT Transformer based NER From 48cdb3fac51dd4b4f2184766d475803bfb639c64 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 13:35:20 -0800 Subject: [PATCH 22/64] example usage on BioNLP dataset --- examples/BioNLP/README.md | 119 +++++++++++++++++++++++++++++++++ examples/BioNLP/data_prep.sh | 24 +++++++ examples/BioNLP/test_models.py | 102 ++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) create mode 100644 examples/BioNLP/README.md create mode 100755 examples/BioNLP/data_prep.sh create mode 100644 examples/BioNLP/test_models.py diff --git a/examples/BioNLP/README.md b/examples/BioNLP/README.md new file mode 100644 index 0000000..10dc191 --- /dev/null +++ b/examples/BioNLP/README.md @@ -0,0 +1,119 @@ +# Dataset description + +Data comes from the [Report on Bio-Entity Recognition Task at BioNLP/NLPBA 2004](http://www.nactem.ac.uk/tsujii/GENIA/ERtask/report.html) page. The page describes the provenance and characteristics of the data. + +In addition, [GloVe (Global Vectors for Word Representation) vectors](https://nlp.stanford.edu/projects/glove/) are needed to run the ElmoNER model. + +To make the data available for use by our example, execute the script `data_prep.sh` in the current directory. This script will create a `data` directory, and also download the GloVe vectors needed by the example. + +## Entity distribution + +``` + 25307 DNA + 2481 RNA + 11217 cell_line + 15466 cell_type + 55117 protein +``` + +## Training + +Our example will use the `data/train/Genia4ERtask1.iob2` file for training, and the `data/test/Genia4EReval1.iob2` file for evaluation. Both files are already in BIO format. Entity distribution shown above is for training data. + +## Results + +### Dictionary NER + +``` + precision recall f1-score support + + cell_line 0.63 0.47 0.54 1489 + cell_type 0.71 0.63 0.67 4912 + protein 0.72 0.65 0.69 9841 + DNA 0.63 0.50 0.56 2845 + RNA 0.57 0.46 0.51 305 + + micro avg 0.70 0.61 0.65 19392 + macro avg 0.65 0.54 0.59 19392 +weighted avg 0.70 0.61 0.65 19392 +``` + +### CRF NER (max_iterations=100, c1=0.1, c2=0.1) + +``` + precision recall f1-score support + + cell_line 0.58 0.70 0.63 1489 + cell_type 0.88 0.71 0.79 4912 + protein 0.79 0.80 0.80 9841 + DNA 0.77 0.73 0.75 2845 + RNA 0.77 0.72 0.74 305 + + micro avg 0.79 0.76 0.77 19392 + macro avg 0.76 0.73 0.74 19392 +weighted avg 0.79 0.76 0.77 19392 +``` + +### SpaCy NER (num_epochs=20, dropout=0.1) + +``` + precision recall f1-score support + + cell_line 0.56 0.76 0.65 1489 + cell_type 0.89 0.66 0.76 4912 + protein 0.78 0.84 0.81 9841 + DNA 0.77 0.76 0.77 2845 + RNA 0.77 0.76 0.77 305 + + micro avg 0.78 0.78 0.78 19392 + macro avg 0.76 0.76 0.75 19392 +weighted avg 0.79 0.78 0.78 19392 +``` + +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) + +``` + precision recall f1-score support + + cell_line 0.53 0.77 0.63 1489 + cell_type 0.88 0.71 0.78 4912 + protein 0.81 0.78 0.79 9841 + DNA 0.73 0.83 0.78 2845 + RNA 0.80 0.78 0.79 305 + + micro avg 0.78 0.77 0.77 19392 + macro avg 0.75 0.77 0.76 19392 +weighted avg 0.79 0.77 0.78 19392 +``` + +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) + +``` + precision recall f1-score support + + cell_line 
0.53      0.73      0.61      1489
+   cell_type       0.85      0.75      0.79      4912
+     protein       0.80      0.87      0.83      9841
+         DNA       0.77      0.86      0.81      2845
+         RNA       0.77      0.86      0.81       305
+
+   micro avg       0.78      0.82      0.80     19392
+   macro avg       0.74      0.81      0.77     19392
+weighted avg       0.79      0.82      0.80     19392
+```
+
+### Majority voting ensemble (pre-trained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER)
+
+```
+              precision    recall  f1-score   support
+
+   cell_line       0.67      0.70      0.69      1489
+   cell_type       0.91      0.69      0.78      4912
+     protein       0.83      0.77      0.80      9841
+         DNA       0.83      0.74      0.78      2845
+         RNA       0.81      0.73      0.77       305
+
+   micro avg       0.84      0.74      0.78     19392
+   macro avg       0.81      0.73      0.76     19392
+weighted avg       0.84      0.74      0.78     19392
+```
diff --git a/examples/BioNLP/data_prep.sh b/examples/BioNLP/data_prep.sh
new file mode 100755
index 0000000..0b71e7c
--- /dev/null
+++ b/examples/BioNLP/data_prep.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+echo "Creating directories..."
+mkdir data
+cd data
+mkdir train test
+
+echo "Downloading training data..."
+cd train
+curl -O http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtraining.tar.gz
+tar xvf Genia4ERtraining.tar.gz
+rm Genia4ERtraining.tar.gz
+
+echo "Downloading test data..."
+cd ../test
+curl -O http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtest.tar.gz
+tar xvf Genia4ERtest.tar.gz
+rm Genia4ERtest.tar.gz
+
+cd ../..
+echo "Downloading GloVe embeddings..."
+wget http://nlp.stanford.edu/data/glove.6B.zip
+unzip -a glove.6B.zip
+rm glove.6B.200d.txt glove.6B.300d.txt glove.6B.50d.txt glove.6B.zip
+
diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py
new file mode 100644
index 0000000..8e5eef7
--- /dev/null
+++ b/examples/BioNLP/test_models.py
@@ -0,0 +1,102 @@
+import os
+import shutil
+
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from sklearn.utils import shuffle
+
+from nerds.models import (
+    DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, ElmoNER, EnsembleNER
+)
+from nerds.utils import *
+
+# these are our entities
+entity_labels = ["cell_line", "cell_type", "protein", "DNA", "RNA"]
+
+# load data
+xtrain, ytrain = load_data_and_labels("data/train/Genia4ERtask1.iob2")
+xtest, ytest = load_data_and_labels("data/test/Genia4EReval1.iob2")
+print(len(xtrain), len(ytrain), len(xtest), len(ytest))
+
+# make model directory to store our models
+if not os.path.exists("models"):
+    os.makedirs("models")
+
+# # train and test the Dictionary NER
+# model = DictionaryNER()
+# model.fit(xtrain, ytrain)
+# model.save("models/dict_model")
+# trained_model = model.load("models/dict_model")
+# ypred = trained_model.predict(xtest)
+# print(classification_report(flatten_list(ytest, strip_prefix=True),
+#     flatten_list(ypred, strip_prefix=True),
+#     labels=entity_labels))
+
+# # train and test the CRF NER
+# model = CrfNER()
+# model.fit(xtrain, ytrain)
+# model.save("models/crf_model")
+# trained_model = model.load("models/crf_model")
+# ypred = trained_model.predict(xtest)
+# print(classification_report(flatten_list(ytest, strip_prefix=True),
+#     flatten_list(ypred, strip_prefix=True),
+#     labels=entity_labels))
+
+# # train and test the SpaCy NER
+# model = SpacyNER()
+# model.fit(xtrain, ytrain)
+# model.save("models/spacy_model")
+# trained_model = model.load("models/spacy_model")
+# ypred = trained_model.predict(xtest)
+# print(classification_report(flatten_list(ytest, strip_prefix=True),
+#     flatten_list(ypred, strip_prefix=True),
+#     labels=entity_labels))
+
+# # train and test the BiLSTM-CRF NER
+# model = BiLstmCrfNER()
+# model.fit(xtrain, ytrain)
+# 
model.save("models/bilstm_model") +# trained_model = model.load("models/bilstm_model") +# ypred = trained_model.predict(xtest) +# print(classification_report(flatten_list(ytest, strip_prefix=True), +# flatten_list(ypred, strip_prefix=True), +# labels=entity_labels)) + +# # train and test the ELMo NER +# if os.path.exists("glove.6B.100d.txt"): +# model = ElmoNER() +# model.fit(xtrain, ytrain) +# model.save("models/elmo_model") +# trained_model = model.load("models/elmo_model") +# ypred = trained_model.predict(xtest) +# print(classification_report(flatten_list(ytest, strip_prefix=True), +# flatten_list(ypred, strip_prefix=True), +# labels=entity_labels)) + +# create and test an ensemble +dict_model = DictionaryNER() +dict_model.load("models/dict_model") +crf_model = CrfNER() +crf_model.load("models/crf_model") +spacy_model = SpacyNER() +spacy_model.load("models/spacy_model") +bilstm_model = BiLstmCrfNER() +bilstm_model.load("models/bilstm_model") +model = EnsembleNER() +model.fit(xtrain, ytrain, + estimators=[ + (dict_model, {}), + (crf_model, {}), + (spacy_model, {}), + (bilstm_model, {}) + ], + is_pretrained=True) +ypred = model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# # clean up +# shutil.rmtree("models") +# shutil.rmtree("data") +# os.remove("glove.6B.100d.txt") \ No newline at end of file From 156bf89abafd877f214dd73ef127d1f618ce0a00 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 13:41:04 -0800 Subject: [PATCH 23/64] uncommenting commented code --- examples/BioNLP/test_models.py | 100 ++++++++++++++++----------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py index 8e5eef7..e2a69d0 100644 --- a/examples/BioNLP/test_models.py +++ b/examples/BioNLP/test_models.py @@ -22,56 +22,56 @@ if not os.path.exists("models"): os.makedirs("models") -# # train and test the Dictionary NER -# model = DictionaryNER() -# model.fit(xtrain, ytrain) -# model.save("models/dict_model") -# trained_model = model.load("models/dict_model") -# ypred = trained_model.predict(xtest) -# print(classification_report(flatten_list(ytest, strip_prefix=True), -# flatten_list(ypred, strip_prefix=True), -# labels=entity_labels)) +# train and test the Dictionary NER +model = DictionaryNER() +model.fit(xtrain, ytrain) +model.save("models/dict_model") +trained_model = model.load("models/dict_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) -# # train and test the CRF NER -# model = CrfNER() -# model.fit(xtrain, ytrain) -# model.save("models/crf_model") -# trained_model = model.load("models/crf_model") -# ypred = trained_model.predict(xtest) -# print(classification_report(flatten_list(ytest, strip_prefix=True), -# flatten_list(ypred, strip_prefix=True), -# labels=entity_labels)) +# train and test the CRF NER +model = CrfNER() +model.fit(xtrain, ytrain) +model.save("models/crf_model") +trained_model = model.load("models/crf_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) -# # train and test the SpaCy NER -# model = SpacyNER() -# model.fit(xtrain, ytrain) -# model.save("models/spacy_model") -# trained_model = model.load("models/spacy_model") -# ypred = 
trained_model.predict(xtest) -# print(classification_report(flatten_list(ytest, strip_prefix=True), -# flatten_list(ypred, strip_prefix=True), -# labels=entity_labels)) +# train and test the SpaCy NER +model = SpacyNER() +model.fit(xtrain, ytrain) +model.save("models/spacy_model") +trained_model = model.load("models/spacy_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) -# # train and test the BiLSTM-CRF NER -# model = BiLstmCrfNER() -# model.fit(xtrain, ytrain) -# model.save("models/bilstm_model") -# trained_model = model.load("models/bilstm_model") -# ypred = trained_model.predict(xtest) -# print(classification_report(flatten_list(ytest, strip_prefix=True), -# flatten_list(ypred, strip_prefix=True), -# labels=entity_labels)) +# train and test the BiLSTM-CRF NER +model = BiLstmCrfNER() +model.fit(xtrain, ytrain) +model.save("models/bilstm_model") +trained_model = model.load("models/bilstm_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) -# # train and test the ELMo NER -# if os.path.exists("glove.6B.100d.txt"): -# model = ElmoNER() -# model.fit(xtrain, ytrain) -# model.save("models/elmo_model") -# trained_model = model.load("models/elmo_model") -# ypred = trained_model.predict(xtest) -# print(classification_report(flatten_list(ytest, strip_prefix=True), -# flatten_list(ypred, strip_prefix=True), -# labels=entity_labels)) +# train and test the ELMo NER +if os.path.exists("glove.6B.100d.txt"): + model = ElmoNER() + model.fit(xtrain, ytrain) + model.save("models/elmo_model") + trained_model = model.load("models/elmo_model") + ypred = trained_model.predict(xtest) + print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) # create and test an ensemble dict_model = DictionaryNER() @@ -96,7 +96,7 @@ flatten_list(ypred, strip_prefix=True), labels=entity_labels)) -# # clean up -# shutil.rmtree("models") -# shutil.rmtree("data") -# os.remove("glove.6B.100d.txt") \ No newline at end of file +# clean up +shutil.rmtree("models") +shutil.rmtree("data") +os.remove("glove.6B.100d.txt") \ No newline at end of file From 7cc402e8d231edf23b6528e29549cf68cb495220 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 13:45:54 -0800 Subject: [PATCH 24/64] API update (transform -> predict) --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f3a9b58..95d658c 100644 --- a/README.md +++ b/README.md @@ -102,14 +102,14 @@ labels [ ## Extending the base model class -The basic class that every model needs to extend is the `NERModel` class in the `nerds.models` package. The model class implements a `fit - transform` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum: +The basic class that every model needs to extend is the `NERModel` class in the `nerds.models` package. The model class implements a `fit - predict` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum: -* `fit`: Trains a model given a list of list of tokens and BIO tags. -* `predict`: Returns a list of list of BIO tags, given a list of list of tokens. -* `save`: Disk persistence of a model. -* `load`: Disk persistence of a model. 
+* `fit(X, y)`: Trains a model given a list of list of tokens X and BIO tags y. +* `predict(X)`: Returns a list of list of BIO tags, given a list of list of tokens X. +* `save(dirpath)`: Saves model to directory given by dirpath. +* `load(dirpath)`: Retrieves model from directory given by dirpath. -As a best practice, I like to implement a single NER model as a single file in the `models` folder, but have it be accessible from client code directly as `nerds.models.CustomNER`. You can set this redirection up in `nerds/models/__init__.py`. +As a best practice, I like to implement a single NER model (or group of related NER models) as a single file in the `models` folder, but have it be accessible from client code directly as `nerds.models.CustomNER`. You can set this redirection up in `nerds/models/__init__.py`. # Running experiments From 538069a0f84c71a60c5e5fdffc69fe9242ce6f42 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 22 Nov 2019 19:05:25 -0800 Subject: [PATCH 25/64] remove NLTK, add allenNLP (for ElmoNER) and transformers --- Makefile | 1 - docs/CHANGES.md | 1 + setup.py | 5 +++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index b1a6fb0..e86113c 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ install: download_models: python -m spacy download en - python -m nltk.downloader averaged_perceptron_tagger clean: -rm -rf build diff --git a/docs/CHANGES.md b/docs/CHANGES.md index c1cdbff..2fc5165 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -3,6 +3,7 @@ ## Completed * Replace AnnotatedDocument common data format to List of List format borrowed from Anago. +* Removes dependency on NLTK * Model * NERModel -- base class extends ClassifierMixin, so exposes predict() instead of transform(). * DictionaryNER diff --git a/setup.py b/setup.py index 2a6c2da..ee1cbf0 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ name="nerds", author="Elsevier Content & Innovation", install_requires=[ + 'allennlp', 'anago', 'future', 'h5py', @@ -11,7 +12,6 @@ 'joblib', 'keras', 'networkx==1.11', - 'nltk', 'numpy', 'pyahocorasick', 'pyyaml', @@ -21,7 +21,8 @@ 'sklearn-crfsuite', 'spacy', 'tensorflow', - 'torch' + 'torch', + 'transformers' ], tests_require=[ 'coverage', From 1a9dccc024782b0b4ff5bd711f670cede836cbcb Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 26 Nov 2019 07:28:25 -0800 Subject: [PATCH 26/64] spelling fix --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 95d658c..7702ea0 100644 --- a/README.md +++ b/README.md @@ -75,9 +75,9 @@ NERDS is a framework that provides some NER capabilities - among which the optio ## Understanding the main data exchange classes -The NERDS master project on [elsevierlabs-os/nerds](https://github.com/elsevierlabs-os/nerds) project uses a set of custom data exchange classes `Document`, `Annotation`, and `AnnotatedDocument`. The project provided a set of conversion utilities which could be used to convert provided datasets to this format, and convert instances of these classes back to whatever format the underlying wrapped NER model needed. However, this NERDS fork on [sujitpal/nerds](https://github.com/sujitpal/nerds) eliminates this requirement -- the internal format is just a list of list of tokens (words in sentence) or BIO tags. The utility function `nerds.utils.load_data_and_labels` can read a file in ConLL BIO format and convert to this internal format. 
This decision was made because 3 of the 5 provided models consume the list of list format natively, and the result is fewer lines of extra code and less potential for error. +The NERDS master project on [elsevierlabs-os/nerds](https://github.com/elsevierlabs-os/nerds) project uses a set of custom data exchange classes `Document`, `Annotation`, and `AnnotatedDocument`. The project provided a set of conversion utilities which could be used to convert provided datasets to this format, and convert instances of these classes back to whatever format the underlying wrapped NER model needed. However, this NERDS fork on [sujitpal/nerds](https://github.com/sujitpal/nerds) eliminates this requirement -- the internal format is just a list of list of tokens (words in sentence) or BIO tags. The utility function `nerds.utils.load_data_and_labels` can read a file in CoNLL BIO format and convert to this internal format. This decision was made because 3 of the 5 provided models consume the list of list format natively, and the result is fewer lines of extra code and less potential for error. -In general, when given an input format that is not in ConLL BIO format, the main effort in using NERDS would be to convert it to ConLL BIO format. Once that is done, it is relatively easy to ingest it into a data and label structure, as shown below. +In general, when given an input format that is not in CoNLL BIO format, the main effort in using NERDS would be to convert it to CoNLL BIO format. Once that is done, it is relatively easy to ingest it into a data and label structure, as shown below. ```python from nerds.utils import load_data_and_labels From e36981fae3f7823526a60f86283ecf111bac8415 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 3 Dec 2019 06:13:12 -0800 Subject: [PATCH 27/64] added utility functions to convert back and forth from tokens-tags to sentence-spans representations. --- nerds/test/test_utils.py | 34 +++++++++++++++++ nerds/utils.py | 80 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index ac42f00..d2ae666 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -2,6 +2,9 @@ from nerds.utils import * +import spacy + +spacy_lm = spacy.load("en") def test_load_data_and_labels(): X, y = load_data_and_labels("nerds/test/data/example.iob") @@ -20,3 +23,34 @@ def test_flatten_and_unflatten_list(): assert_equal(len(y), len(y_unflat), "Reconstructed y (y_unflat) should be identical to y") assert_equal(len(y[0]), len(y_unflat[0]), "Reconstructed y (y_unflat) should be identical to y") +def test_tokens_to_spans(): + data, labels = load_data_and_labels("nerds/test/data/example.iob") + tokens, tags = data[0], labels[0] + sentence, spans = tokens_to_spans(tokens, tags) + assert_equal( + "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 
29 .", + sentence, "Sentence reconstruction is incorrect") + assert_equal(3, len(spans), "Should be exactly 3 spans") + assert_equal(0, spans[0][0], "spans[0].start should be 0") + assert_equal(13, spans[0][1], "spans[0].end should be 13") + assert_equal("PER", spans[0][2], "spans[0].cls should be PER") + assert_equal(16, spans[1][0], "spans[1].start should be 16") + assert_equal(28, spans[1][1], "spans[1].end should be 28") + assert_equal("DATE", spans[1][2], "spans[1].cls should be DATE") + assert_equal(78, spans[2][0], "spans[2].start should be 78") + assert_equal(86, spans[2][1], "spans[2].end should be 86") + assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") + +def test_spans_to_tokens(): + sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." + spans = [(0, 11, "PER"), (27, 43, "ORG"), (50, 55, "NORP")] + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm) + # reference tokens and tags for comparison + data, labels = load_data_and_labels("nerds/test/data/example.iob") + ref_tokens, ref_tags = data[1], labels[1] + assert_equal(len(tokens), len(ref_tokens), "Number of tokens should be identical") + for token, ref_token in zip(tokens, ref_tokens): + assert_equal(ref_token, token, "Tokens do not match. {:s} != {:s}".format(ref_token, token)) + assert_equal(len(tags), len(ref_tags), "Number of BIO tags should be identical") + for tag, ref_tag in zip(tags, ref_tags): + assert_equal(ref_tag, tag, "Tags do not match. {:s} != {:s}".format(ref_tag, tag)) diff --git a/nerds/utils.py b/nerds/utils.py index bd79188..214ca32 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -84,3 +84,83 @@ def unflatten_list(xs_flat, xs_lengths): start = end return xs_unflat + +def tokens_to_spans(tokens, tags): + """ Convert from tokens-tags format to sentence-span format. Some NERs + use the sentence-span format, so we need to transform back and forth. + + Args: + tokens (list(str)): list of tokens representing single sentence. + tags (list(str)): list of tags in BIO format. + + Returns: + sentence (str): the sentence as a string. + spans (list((int, int, str))): list of spans as a 3-tuple of start + position, end position, and entity type. Note that end position + is 1 beyond the actual ending position of the token. + """ + spans = [] + curr, start, end, ent_cls = 0, None, None, None + sentence = " ".join(tokens) + # print("tokens:", tokens) + # print("tags:", tags) + for token, tag in zip(tokens, tags): + if tag == "O": + if ent_cls is not None: + spans.append((start, end, ent_cls)) + start, end, ent_cls = None, None, None + elif tag.startswith("B-"): + ent_cls = tag.split("-")[1] + start = curr + end = curr + len(token) + else: # I-xxx + end += len(token) + 1 + # advance curr + curr += len(token) + 1 + + if ent_cls is not None: + spans.append((start, end, ent_cls)) + + return sentence, spans + + +def spans_to_tokens(sentence, spans, spacy_lm): + """ Convert from sentence-spans format to tokens-tags format. Some NERs + use the sentence-spans format, so we need to transform back and forth. + + Args: + sentence (str): the sentence as a string. + spans (list((int, int, str))): list of spans as a 3-tuple of + start_position, end_position, and entity_type. Note that end + position is 1 beyond actual end position of the token. + spacy_lm: we use SpaCy EN language model to tokenizing the + sentence to generate list of tokens. + + Returns: + tokens (list(str)): list of tokens in sentence + tags (list(str)): list of tags in BIO format. 
+ """ + tokens, tags = [], [] + curr_start, curr_end = 0, 0 + for t in spacy_lm(sentence): + tokens.append(t.text) + curr_end = curr_start + len(t.text) + is_annotated = False + for span_start, span_end, span_cls in spans: + if curr_start == span_start: + tags.append("B-" + span_cls) + is_annotated = True + break + elif curr_start > span_start and curr_end <= span_end: + tags.append("I-" + span_cls) + is_annotated = True + break + else: + continue + if not is_annotated: + tags.append("O") + + curr_start += len(t.text) + 1 + + return tokens, tags + From fdc13c134ad595959a38af46346830da57464906 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 3 Dec 2019 06:14:24 -0800 Subject: [PATCH 28/64] cosmetic change: added line before def --- nerds/test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index d2ae666..03da2b7 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -23,6 +23,7 @@ def test_flatten_and_unflatten_list(): assert_equal(len(y), len(y_unflat), "Reconstructed y (y_unflat) should be identical to y") assert_equal(len(y[0]), len(y_unflat[0]), "Reconstructed y (y_unflat) should be identical to y") + def test_tokens_to_spans(): data, labels = load_data_and_labels("nerds/test/data/example.iob") tokens, tags = data[0], labels[0] @@ -41,6 +42,7 @@ def test_tokens_to_spans(): assert_equal(86, spans[2][1], "spans[2].end should be 86") assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") + def test_spans_to_tokens(): sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." spans = [(0, 11, "PER"), (27, 43, "ORG"), (50, 55, "NORP")] From 805428dab1d6ead6e261c7c236cfff64320eae8a Mon Sep 17 00:00:00 2001 From: sujitpal Date: Tue, 3 Dec 2019 07:59:51 -0800 Subject: [PATCH 29/64] applied token_to_spans and spans_to_tokens in the model code (spacy and dictionary). --- nerds/models/dictionary.py | 45 ++-------------------- nerds/models/spacy.py | 75 ++++-------------------------------- nerds/test/test_spacy_ner.py | 5 ++- nerds/test/test_utils.py | 14 ++++++- nerds/utils.py | 64 +++++++++++++++++++++--------- 5 files changed, 72 insertions(+), 131 deletions(-) diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py index d0de654..c4aba24 100644 --- a/nerds/models/dictionary.py +++ b/nerds/models/dictionary.py @@ -1,9 +1,10 @@ from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, spans_to_tokens import ahocorasick import joblib import os +import spacy log = get_logger() @@ -13,6 +14,7 @@ def __init__(self, entity_label=None): super().__init__(entity_label) self.key = "aho-corasick-dict-ner" self.model = None + self.spacy_lm = spacy.load("en") def fit(self, X, y, @@ -77,7 +79,7 @@ def predict(self, X): # remove subsumed phrases longest_phrases = self._remove_subsumed_matches(matched_phrases, 1) # convert longest matches to IOB format - pred = self._convert_matches_to_iob_tags(tokens, longest_phrases) + _, pred = spans_to_tokens(sent, longest_phrases, self.spacy_lm) predictions.append(pred) return predictions @@ -150,7 +152,6 @@ def _add_if_not_spurious_match(self, start_index, end_index, tag, matched_phrases (list(str)): list of matched phrases, updated in place by function. 
""" - # print(start_index, end_index, tag, sentence, len(sentence)) if start_index == 0: if end_index + 1 < len(sentence): if sentence[end_index + 1] == " ": @@ -193,41 +194,3 @@ def _remove_subsumed_matches(self, matched_phrases, k): else: longest_matches.append(phrase) return self._remove_subsumed_matches(longest_matches, k+1) - - - def _convert_matches_to_iob_tags(self, tokens, matched_phrases): - """ Merges the longest matches with the original tokens to - produce a list of IOB tags for the sentence. - - Args: - tokens (list(str)): list of tokens for the sentence. - matched_phrase (list((start, end, tag))): list of longest - matched phrase tuples. - - Returns: - iob_tags (list(str)): list of IOB tags, each tag - corresponds to a word token. - """ - iob_tags = [] - curr_offset = 0 - prev_label = "O" - for token in tokens: - start_offset = curr_offset - end_offset = start_offset + len(token) - token_matched = False - matched_label = None - for phrase_start, phrase_end, phrase_label in matched_phrases: - if start_offset >= phrase_start and end_offset <= phrase_end: - token_matched = True - matched_label = phrase_label - break - if token_matched: - iob_tags.append( - "I-" + phrase_label if prev_label == phrase_label - else "B-" + phrase_label) - prev_label = phrase_label - else: - iob_tags.append("O") - prev_label = "O" - curr_offset = end_offset + 1 - return iob_tags diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 9db2131..3a51733 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -1,5 +1,5 @@ from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, spans_to_tokens, tokens_to_spans from spacy.util import minibatch @@ -22,6 +22,7 @@ def __init__(self, entity_label=None): super().__init__(entity_label) self.key = "spacy_ner" self.model = None + self.spacy_lm = spacy.load("en") def fit(self, X, y, @@ -100,8 +101,7 @@ def predict(self, X): for sent_tokens in X: sent = " ".join(sent_tokens) doc = self.model(sent) - entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - sent_preds = self._convert_from_spacy(sent, entities) + sent_preds = self._convert_from_spacy(sent, doc.ents) preds.append(sent_preds) return preds @@ -157,55 +157,8 @@ def _convert_to_spacy(self, tokens, labels): } ) """ - content = " ".join(tokens) - offsets, current_offset = [], 0 - for token, label in zip(tokens, labels): - start_offset = current_offset - end_offset = start_offset + len(token) - if label != "O": - offsets.append((start_offset, end_offset, label.split("-")[-1])) - current_offset = end_offset + 1 # skip space - return (content, {"entities": offsets}) - - - def _entities2dict(self, entities): - """ Convert entities returned from SpaCy into a dictionary keyed by - offset pair. This allows predicted entities to be looked up quickly - and the appropriate tag populated in _convert_from_spacy(). - - Args: - entities (list((begin, end, label))): list of label predictions. - - Return: - entity_dict (dict{(begin, end): label}) - """ - entity_dict = {} - for begin, end, label in entities: - key = ":".join([str(begin), str(end)]) - entity_dict[key] = label - return entity_dict - - - def _tokenize_with_offsets(self, sent): - """ Tokenize a sentence by space, and write out tuples that include - offsets for each token. - - Args: - sent (str): sentence as a string. - - Returns: - offsets (list((start, end), token)): list of tokens with - offset information. 
- """ - tokens = sent.split() - offsets, curr_offset = [], 0 - for token in tokens: - begin = curr_offset - end = begin + len(token) - key = ":".join([str(begin), str(end)]) - offsets.append((key, token)) - curr_offset = end + 1 - return offsets + sentence, spans = tokens_to_spans(tokens, labels, merged=False) + return (sentence, {"entities": spans}) def _convert_from_spacy(self, sent, entities): @@ -220,19 +173,7 @@ def _convert_from_spacy(self, sent, entities): predictions (list(str)): a list of BIO tags for a single sentence. """ - bio_tags, prev_tag = [], None - entity_dict = self._entities2dict(entities) - for offset_token in self._tokenize_with_offsets(sent): - offset, token = offset_token - if offset in entity_dict: - curr_tag = entity_dict[offset] - if prev_tag is None or prev_tag != curr_tag: - bio_tags.append("B-" + curr_tag) - else: - bio_tags.append("I-" + curr_tag) - else: - curr_tag = "O" - bio_tags.append("O") - prev_tag = curr_tag - return bio_tags + spans = [(e.start_char, e.end_char, e.label_) for e in entities] + tokens, tags = spans_to_tokens(sent, spans, self.spacy_lm, merged=False) + return tags diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py index 0525b44..db1bc37 100644 --- a/nerds/test/test_spacy_ner.py +++ b/nerds/test/test_spacy_ner.py @@ -5,7 +5,7 @@ import shutil -def test_crf_ner(): +def test_spacy_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") model = SpacyNER() model.fit(X, y) @@ -14,4 +14,5 @@ def test_crf_ner(): y_pred = model_r.predict(X) assert_equal(y, y_pred, "Label and prediction must be equal") assert_equal(1.0, model.score(X, y)) - shutil.rmtree("nerds/test/data/models") \ No newline at end of file + shutil.rmtree("nerds/test/data/models") + # assert_true(False) \ No newline at end of file diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index 03da2b7..3a0c769 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -27,7 +27,7 @@ def test_flatten_and_unflatten_list(): def test_tokens_to_spans(): data, labels = load_data_and_labels("nerds/test/data/example.iob") tokens, tags = data[0], labels[0] - sentence, spans = tokens_to_spans(tokens, tags) + sentence, spans = tokens_to_spans(tokens, tags, merged=True) assert_equal( "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .", sentence, "Sentence reconstruction is incorrect") @@ -46,7 +46,7 @@ def test_tokens_to_spans(): def test_spans_to_tokens(): sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." spans = [(0, 11, "PER"), (27, 43, "ORG"), (50, 55, "NORP")] - tokens, tags = spans_to_tokens(sentence, spans, spacy_lm) + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, merged=True) # reference tokens and tags for comparison data, labels = load_data_and_labels("nerds/test/data/example.iob") ref_tokens, ref_tags = data[1], labels[1] @@ -56,3 +56,13 @@ def test_spans_to_tokens(): assert_equal(len(tags), len(ref_tags), "Number of BIO tags should be identical") for tag, ref_tag in zip(tags, ref_tags): assert_equal(ref_tag, tag, "Tags do not match. {:s} != {:s}".format(ref_tag, tag)) + + +def test_spans_to_tokens_unmerged(): + sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." 
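+    # each token of a multi-word entity is listed as its own span below,
+    # e.g. "Mr", ".", "Vinken" arrive as three consecutive PER spans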
+ spans = [(0, 2, 'PER'), (3, 4, 'PER'), (5, 11, 'PER'), (27, 35, 'ORG'), (36, 37, 'ORG'), (38, 39, 'ORG'), (40, 41, 'ORG'), (42, 43, 'ORG'), (50, 55, 'NORP')] + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, merged=False) + ref_preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] + for ref_pred, pred in zip(ref_preds, tags): + assert_equal(ref_pred, pred, "Tags do not match. {:s} != {:s}".format(ref_pred, pred)) + diff --git a/nerds/utils.py b/nerds/utils.py index 214ca32..e4a63df 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -85,13 +85,16 @@ def unflatten_list(xs_flat, xs_lengths): return xs_unflat -def tokens_to_spans(tokens, tags): +def tokens_to_spans(tokens, tags, merged=True): """ Convert from tokens-tags format to sentence-span format. Some NERs use the sentence-span format, so we need to transform back and forth. Args: tokens (list(str)): list of tokens representing single sentence. tags (list(str)): list of tags in BIO format. + merged (bool): if True, offsets for consecutive tokens of the same + entity type are merged into a single span, else spans are + reported individually. Returns: sentence (str): the sentence as a string. @@ -102,29 +105,37 @@ def tokens_to_spans(tokens, tags): spans = [] curr, start, end, ent_cls = 0, None, None, None sentence = " ".join(tokens) - # print("tokens:", tokens) - # print("tags:", tags) - for token, tag in zip(tokens, tags): - if tag == "O": - if ent_cls is not None: + if merged: + for token, tag in zip(tokens, tags): + if tag == "O": + if ent_cls is not None: + spans.append((start, end, ent_cls)) + start, end, ent_cls = None, None, None + elif tag.startswith("B-"): + ent_cls = tag.split("-")[1] + start = curr + end = curr + len(token) + else: # I-xxx + end += len(token) + 1 + # advance curr + curr += len(token) + 1 + + # handle remaining span + if ent_cls is not None: + spans.append((start, end, ent_cls)) + else: + for token, tag in zip(tokens, tags): + if tag.startswith("B-") or tag.startswith("I-"): + ent_cls = tag.split("-")[1] + start = curr + end = curr + len(token) spans.append((start, end, ent_cls)) - start, end, ent_cls = None, None, None - elif tag.startswith("B-"): - ent_cls = tag.split("-")[1] - start = curr - end = curr + len(token) - else: # I-xxx - end += len(token) + 1 - # advance curr - curr += len(token) + 1 - - if ent_cls is not None: - spans.append((start, end, ent_cls)) + curr += len(token) + 1 return sentence, spans -def spans_to_tokens(sentence, spans, spacy_lm): +def spans_to_tokens(sentence, spans, spacy_lm, merged=True): """ Convert from sentence-spans format to tokens-tags format. Some NERs use the sentence-spans format, so we need to transform back and forth. @@ -135,6 +146,10 @@ def spans_to_tokens(sentence, spans, spacy_lm): position is 1 beyond actual end position of the token. spacy_lm: we use SpaCy EN language model to tokenizing the sentence to generate list of tokens. + merged (bool): if True, indicates that spans are merged (ie, the + are multi-word spans). Otherwise, indicates that spans are + single tokens, and consecutive entries of the same class needs + to be transformed, ie. 
(B-x, B-x) should become (B-x, I-x) Returns: tokens (list(str)): list of tokens in sentence @@ -162,5 +177,16 @@ def spans_to_tokens(sentence, spans, spacy_lm): curr_start += len(t.text) + 1 + # handle consecutive class labels if merged=False + if not merged: + prev_tag, merged_tags = None, [] + for tag in tags: + if prev_tag is None or prev_tag != tag: + merged_tags.append(tag) + else: + merged_tags.append(tag.replace("B-", "I-")) + prev_tag = tag + tags = merged_tags + return tokens, tags From 07db37b6d580a27ba23213c210ea692dbeda5819 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Thu, 5 Dec 2019 07:41:27 -0800 Subject: [PATCH 30/64] updated Anago dev link in setup.py and added PyData LA slides link. --- README.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7702ea0..62ed01f 100644 --- a/README.md +++ b/README.md @@ -131,3 +131,6 @@ The [CONTRIBUTING.md file](docs/CONTRIBUTING.md) lists contributors who have con The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that were made in this fork. +# Talks and Blogs + +* \[slides\] [Slides for talk at PyData LA 2019](https://www.slideshare.net/sujitpal/building-named-entity-recognition-models-efficiently-using-nerds). diff --git a/setup.py b/setup.py index ee1cbf0..296bce6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ author="Elsevier Content & Innovation", install_requires=[ 'allennlp', - 'anago', + 'anago @ git+https://github.com/Hironsan/anago.git', 'future', 'h5py', 'hyperopt', From ac819fd3a4cf58ea615e377916b86a4b55acd672 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 13 Dec 2019 15:58:49 -0800 Subject: [PATCH 31/64] renaming merged parameter for utils.tokens_to_spans and utils.spans_to_tokens allow_multiword_spans and spans_are_multiword respectively for clarity. --- nerds/models/spacy.py | 4 ++-- nerds/test/test_utils.py | 26 ++++++++++++++++++++++---- nerds/utils.py | 23 +++++++++++------------ 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 3a51733..907fa6a 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -157,7 +157,7 @@ def _convert_to_spacy(self, tokens, labels): } ) """ - sentence, spans = tokens_to_spans(tokens, labels, merged=False) + sentence, spans = tokens_to_spans(tokens, labels, allow_multiword_spans=False) return (sentence, {"entities": spans}) @@ -174,6 +174,6 @@ def _convert_from_spacy(self, sent, entities): sentence. """ spans = [(e.start_char, e.end_char, e.label_) for e in entities] - tokens, tags = spans_to_tokens(sent, spans, self.spacy_lm, merged=False) + tokens, tags = spans_to_tokens(sent, spans, self.spacy_lm, spans_are_multiword=False) return tags diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index 3a0c769..c1f2947 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -27,7 +27,7 @@ def test_flatten_and_unflatten_list(): def test_tokens_to_spans(): data, labels = load_data_and_labels("nerds/test/data/example.iob") tokens, tags = data[0], labels[0] - sentence, spans = tokens_to_spans(tokens, tags, merged=True) + sentence, spans = tokens_to_spans(tokens, tags, allow_multiword_spans=True) assert_equal( "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 
29 .", sentence, "Sentence reconstruction is incorrect") @@ -43,10 +43,28 @@ def test_tokens_to_spans(): assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") +def test_tokens_to_spans_no_multiword_spans(): + data, labels = load_data_and_labels("nerds/test/data/example.iob") + tokens, tags = data[0], labels[0] + # convert to single token per span format + tags = ["O" if t == "O" else "B-" + t.split("-")[1] for t in tags] + sentence, spans = tokens_to_spans(tokens, tags, allow_multiword_spans=False) + assert_equal(8, len(spans), "Should be exactly 8 spans") + assert_equal(0, spans[0][0], "spans[0].start should be 0") + assert_equal(6, spans[0][1], "spans[0].end should be 6") + assert_equal("PER", spans[0][2], "spans[0].cls should be PER") + assert_equal(16, spans[2][0], "spans[2].start should be 16") + assert_equal(18, spans[2][1], "spans[2].end should be 18") + assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") + assert_equal(78, spans[5][0], "spans[5].start should be 78") + assert_equal(81, spans[5][1], "spans[5].end should be 81") + assert_equal("DATE", spans[5][2], "spans[5].cls should be DATE") + + def test_spans_to_tokens(): sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." spans = [(0, 11, "PER"), (27, 43, "ORG"), (50, 55, "NORP")] - tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, merged=True) + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True) # reference tokens and tags for comparison data, labels = load_data_and_labels("nerds/test/data/example.iob") ref_tokens, ref_tags = data[1], labels[1] @@ -58,10 +76,10 @@ def test_spans_to_tokens(): assert_equal(ref_tag, tag, "Tags do not match. {:s} != {:s}".format(ref_tag, tag)) -def test_spans_to_tokens_unmerged(): +def test_spans_to_tokens_no_multiword_spans(): sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." spans = [(0, 2, 'PER'), (3, 4, 'PER'), (5, 11, 'PER'), (27, 35, 'ORG'), (36, 37, 'ORG'), (38, 39, 'ORG'), (40, 41, 'ORG'), (42, 43, 'ORG'), (50, 55, 'NORP')] - tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, merged=False) + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=False) ref_preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] for ref_pred, pred in zip(ref_preds, tags): assert_equal(ref_pred, pred, "Tags do not match. {:s} != {:s}".format(ref_pred, pred)) diff --git a/nerds/utils.py b/nerds/utils.py index e4a63df..1296279 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -85,16 +85,16 @@ def unflatten_list(xs_flat, xs_lengths): return xs_unflat -def tokens_to_spans(tokens, tags, merged=True): +def tokens_to_spans(tokens, tags, allow_multiword_spans=True): """ Convert from tokens-tags format to sentence-span format. Some NERs use the sentence-span format, so we need to transform back and forth. Args: tokens (list(str)): list of tokens representing single sentence. tags (list(str)): list of tags in BIO format. - merged (bool): if True, offsets for consecutive tokens of the same - entity type are merged into a single span, else spans are - reported individually. + allow_multiword_spans (bool): if True, offsets for consecutive + tokens of the same entity type are merged into a single span, + otherwise tokens are reported as individual spans. Returns: sentence (str): the sentence as a string. 
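To make the renamed flags concrete, here is a minimal sketch of how the two helpers behave after this patch. This is an editorial illustration rather than part of the patch itself, and it assumes the SpaCy "en" model used by the test suite is installed:

```python
import spacy
from nerds.utils import tokens_to_spans, spans_to_tokens

spacy_lm = spacy.load("en")
tokens = ["Pierre", "Vinken", "is", "chairman"]
tags = ["B-PER", "I-PER", "O", "O"]

# default: consecutive tokens of one entity are merged into a single span
sentence, spans = tokens_to_spans(tokens, tags)
# sentence == "Pierre Vinken is chairman", spans == [(0, 13, "PER")]

# allow_multiword_spans=False: every entity token is its own span
_, spans = tokens_to_spans(tokens, tags, allow_multiword_spans=False)
# spans == [(0, 6, "PER"), (7, 13, "PER")]

# and back: a multi-word span is re-tokenized into B-/I- tags
tokens2, tags2 = spans_to_tokens(sentence, [(0, 13, "PER")], spacy_lm)
# tags2 == ["B-PER", "I-PER", "O", "O"]
```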
@@ -105,7 +105,7 @@ def tokens_to_spans(tokens, tags, merged=True):
     spans = []
     curr, start, end, ent_cls = 0, None, None, None
     sentence = " ".join(tokens)
-    if merged:
+    if allow_multiword_spans:
         for token, tag in zip(tokens, tags):
             if tag == "O":
                 if ent_cls is not None:
@@ -135,7 +135,7 @@ def tokens_to_spans(tokens, tags, merged=True):
     return sentence, spans
 
 
-def spans_to_tokens(sentence, spans, spacy_lm, merged=True):
+def spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True):
     """ Convert from sentence-spans format to tokens-tags format. Some
         NERs use the sentence-spans format, so we need to transform back
         and forth.
@@ -146,10 +146,9 @@ def spans_to_tokens(sentence, spans, spacy_lm, merged=True):
            position is 1 beyond actual end position of the token.
        spacy_lm: we use SpaCy EN language model to tokenizing the sentence
            to generate list of tokens.
-        merged (bool): if True, indicates that spans are merged (ie, the
-            are multi-word spans). Otherwise, indicates that spans are
-            single tokens, and consecutive entries of the same class needs
-            to be transformed, ie. (B-x, B-x) should become (B-x, I-x)
+        spans_are_multiword (bool): if True, indicates that spans may be
+            multi-word (merged) spans; if False, spans are single tokens,
+            and consecutive same-class tags are rewritten, i.e. (B-x, B-x) becomes (B-x, I-x).
 
     Returns:
         tokens (list(str)): list of tokens in sentence
@@ -177,8 +176,8 @@ def spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True):
 
         curr_start += len(t.text) + 1
 
-    # handle consecutive class labels if merged=False
-    if not merged:
+    # handle consecutive class labels if spans were single word spans
+    if not spans_are_multiword:
         prev_tag, merged_tags = None, []
         for tag in tags:
             if prev_tag is None or prev_tag != tag:

From 124af0bfca56a4113de203dd0407330626e98362 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Fri, 13 Dec 2019 17:48:14 -0800
Subject: [PATCH 32/64] BRAT to IOB converter

---
 converters/brat2iob.py | 178 +++++++++++++++++++++++++++++++++++++++++
 nerds/utils.py         |   6 +-
 2 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 converters/brat2iob.py

diff --git a/converters/brat2iob.py b/converters/brat2iob.py
new file mode 100644
index 0000000..a3a6e52
--- /dev/null
+++ b/converters/brat2iob.py
@@ -0,0 +1,178 @@
+import argparse
+import operator
+import os
+import re
+import shutil
+import spacy
+import tempfile
+
+from nerds.utils import spans_to_tokens, get_logger
+
+def segment_text_to_sentences(text_file, sentence_splitter):
+    """ Segment text into sentences. Text is provided by BRAT in .txt
+        file.
+
+        Args:
+            text_file (str): the full path to the BRAT .txt file.
+            sentence_splitter (spacy LM): SpaCy EN language model.
+
+        Returns:
+            sentences (list((int, int, str))): list of sentence spans.
+                Spans are triples of (start_offset, end_offset, text),
+                where offset is relative to the text.
+    """
+    sentences = []
+    ftext = open(text_file, "r")
+    for line in ftext:
+        splits = sentence_splitter(line.strip())
+        for sent in splits.sents:
+            sentences.append((sent.start_char, sent.end_char, sent.text))
+    ftext.close()
+    return sentences
+
+
+def parse_text_annotations(ann_file):
+    """ Parses BRAT annotations provided in the .ann file and converts them
+        to annotation spans of (start_position, end_position, entity_class).
+
+        Args:
+            ann_file (str): full path to the BRAT .ann file.
+
+        Returns:
+            annotations (list((int, int, str))): list of annotation spans.
+                Spans are triples of (start_offset, end_offset, entity_class)
+                where offset is relative to the text.
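+
+        For example, an annotation line like the one used by the self-test
+        further down in this file:
+            T1 PER 0 13 Pierre Vinken
+        would be parsed into the span (0, 13, "PER").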
+    """
+    annots = []
+    fann = open(ann_file, "r")
+    for line in fann:
+        cols = re.split(r"\s+", line.strip())
+        if not cols[0].startswith("T"):
+            continue
+        annots.append((int(cols[2]), int(cols[3]), cols[1]))
+    fann.close()
+    return annots
+
+
+def apply_annotations(sentences, annotations, tokenizer):
+    """ Apply annotation spans to the sentence spans to create a list of tokens
+        and tags.
+
+        Args:
+            sentences (list((int, int, str))): list of sentence spans.
+            annotations (list((int, int, str))): list of annotation spans.
+            tokenizer (spacy LM): SpaCy EN language model.
+
+        Returns:
+            tokens_tags_list (list((list(str), list(str)))): list of list of token
+                tag pairs. Each list of token-tag pairs corresponds to a single
+                sentence.
+    """
+    tokens_tags_list = []
+    for sent_start, sent_end, sent_text in sentences:
+        sent_annots = [a for a in annotations if a[0] >= sent_start and a[1] <= sent_end]
+        # convert document offsets to sentence offsets
+        sent_annots = [(s[0] - sent_start, s[1] - sent_start, s[2]) for s in sent_annots]
+        tokens, tags = spans_to_tokens(sent_text, sent_annots, tokenizer)
+        tokens_tags_list.append(zip(tokens, tags))
+    return tokens_tags_list
+
+
+def convert_brat_to_iob(input_dir, output_file, nlp):
+    """ Convenience converter function.
+
+        Args:
+            input_dir (str): the directory where the BRAT .txt and .ann files
+                are located.
+            output_file (str): the full path name of file to write output in
+                IOB format to.
+            nlp (SpaCy LM): reference to the SpaCy EN model.
+
+        Returns:
+            None.
+    """
+    fout = open(output_file, "w")
+    for text_file in os.listdir(input_dir):
+        # only process .txt and .ann pairs in specified directory
+        if not text_file.endswith(".txt"):
+            continue
+        annot_file = text_file[:-4] + ".ann"
+        if not os.path.exists(os.path.join(input_dir, annot_file)):
+            # do not process file if no corresponding .ann file
+            continue
+        # process file pair
+        logger.info("Processing file: {:s}".format(text_file))
+        sentences = segment_text_to_sentences(os.path.join(input_dir, text_file), nlp)
+        annotations = parse_text_annotations(os.path.join(input_dir, annot_file))
+        tokens_tags_list = apply_annotations(sentences, annotations, nlp)
+        for tokens_tags in tokens_tags_list:
+            for token, tag in tokens_tags:
+                fout.write("{:s}\t{:s}\n".format(token, tag))
+            fout.write("\n")
+
+    fout.close()
+
+
+def do_self_test(nlp):
+    """ Simple self-test with small dataset to prove that this works okay. """
+    text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director, Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group."
+    annotations = [
+        "T1 PER 0 13 Pierre Vinken",
+        "T2 PER 86 96 Mr. Vinken",
+        "T3 DATE 15 27 61 years old",
+        "T4 DATE 77 84 Nov. 29",
+        "T5 ORG 112 125 Elsevier N.V.",
+        "T6 NORP 131 136 Dutch"
+    ]
+    input_dir = tempfile.mkdtemp(dir="/tmp")
+    ftext = open(os.path.join(input_dir, "test.txt"), "w")
+    ftext.write(text)
+    ftext.close()
+    fann = open(os.path.join(input_dir, "test.ann"), "w")
+    for line in annotations:
+        fann.write(line + "\n")
+    fann.close()
+    output_file = os.path.join(input_dir, "test.iob")
+    convert_brat_to_iob(input_dir, output_file, nlp)
+    fout = open(output_file, "r")
+    for line in fout:
+        logger.warning(line.strip())
+    shutil.rmtree(input_dir)
+
+
+################################ main ################################
+#
+# usage: brat2iob.py [-h] [-i INPUT_DIR] [-o OUTPUT_FILE] [-t]
+# Script to convert BRAT annotations to IOB (NERDS) format.
+# optional arguments: +# -h, --help show this help message and exit +# -i INPUT_DIR, --input_dir INPUT_DIR +# Directory to store BRAT .txt and .ann files. +# -o OUTPUT_FILE, --output_file OUTPUT_FILE +# Output file to write IOB output to. +# -t, --test Runs self test. +###################################################################### + +parser = argparse.ArgumentParser( + description="Script to convert BRAT annotations to IOB (NERDS) format.") +parser.add_argument("-i", "--input_dir", help="Directory to store BRAT .txt and .ann files.") +parser.add_argument("-o", "--output_file", help="Output file to write IOB output to.") +parser.add_argument("-t", "--test", help="Runs self test.", action="store_true") +args = parser.parse_args() + +logger = get_logger() + +input_dir = args.input_dir +output_file = args.output_file +self_test = args.test + +nlp = spacy.load("en") + +if self_test: + logger.info("Executing self test...") + do_self_test(nlp) +else: + logger.info("Reading BRAT .txt and .ann files from: {:s}".format(input_dir)) + logger.info("Writing IOB tokens/tags to file: {:s}".format(output_file)) + convert_brat_to_iob(input_dir, output_file, nlp) + diff --git a/nerds/utils.py b/nerds/utils.py index 1296279..d9e8bfb 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -174,7 +174,11 @@ def spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True): if not is_annotated: tags.append("O") - curr_start += len(t.text) + 1 + # advance pointer across current word + curr_start += len(t.text) + # advance pointer across space if next token separated by space + if curr_start < len(sentence) and sentence[curr_start] == " ": + curr_start += 1 # handle consecutive class labels if spans were single word spans if not spans_are_multiword: From 37864aff65185b429b3da65014a0ebd82bde4e02 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 12:42:50 -0800 Subject: [PATCH 33/64] moving fit keywords to constructor parameters in keeping with scikit-learn recommendations for building custom Estimators. --- docs/CHANGES.md | 15 ++-- nerds/models/base.py | 7 +- nerds/models/bilstm.py | 116 ++++++++++++++++-------------- nerds/models/crf.py | 98 ++++++++++++++----------- nerds/models/dictionary.py | 63 +++++++++------- nerds/models/ensemble.py | 62 ++++++++-------- nerds/models/spacy.py | 72 ++++++++++--------- nerds/test/test_bilstm_ner.py | 4 +- nerds/test/test_crf_ner.py | 15 ++++ nerds/test/test_dictionary_ner.py | 4 +- nerds/test/test_elmo_ner.py | 4 +- nerds/test/test_ensemble_ner.py | 12 ++-- 12 files changed, 268 insertions(+), 204 deletions(-) diff --git a/docs/CHANGES.md b/docs/CHANGES.md index 2fc5165..1cd3751 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -26,19 +26,24 @@ * works against most recent Anago API changes * does not give timestep size errors * ElmoNER - * New, available in Anago, same API as Anago's BiLSTMCRF + * New, available in Anago DEV repo, same API as Anago's BiLSTMCRF * EnsembleNER * simpler interface * weights from each classifier * joblib.Parallel -- improve? * Utils * Thin wrapper over anago's `load_data_and_labels` - * Converter for output so scikit-learn metrics can be used. - -* Other stuff + * `flatten_list` and `unflatten_list` to convert between `list(list(str))` produced by NERDS models and `list(str)` required by `sklearn`, scikit-learn metrics can be used. + * `tokens_to_spans` and `spans_to_tokens` -- utility functions to convert between sentence and span format (used by the other 2 of 5 provided models) from and to BIO format. 
+* Converters + * Converter from BRAT (.txt and .ann) to IOB format +* Miscellaneous * remove deprecated sklearn.external.joblib -> joblib + * True Scikit-Learn interoperability -- some progress has been made, parameters are now provided in constructor rather than as keywords in the `fit()` call. However, `check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are list(list(str)) rather than list(str). ## Planned -* Scikit-Learn interoperability. +* Convert Docs to numpy docstring format -- https://numpydoc.readthedocs.io/en/latest/format.html * BERT Transformer based NER +* FLAIR based NER + diff --git a/nerds/models/base.py b/nerds/models/base.py index afa3b79..798b5e3 100644 --- a/nerds/models/base.py +++ b/nerds/models/base.py @@ -9,10 +9,9 @@ class NERModel(BaseEstimator, ClassifierMixin): This is the core class responsible for training models that perform named entity recognition, and retrieving named entities from documents. """ - def __init__(self, entity_label=None): - self.entity_label = entity_label - self.key = "" # To be added in subclass. - + def __init__(self): + pass + def fit(self, X, y): """ Train the model using data (X) and labels (y). Return trained model. """ diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py index 2d6780b..8abcbc8 100644 --- a/nerds/models/bilstm.py +++ b/nerds/models/bilstm.py @@ -17,22 +17,7 @@ class BiLstmCrfNER(NERModel): - def __init__(self, entity_label=None): - """ Build a Anago Bi-LSTM CRF model. - - Args: - entity_label: label for single entity NER, default None - """ - super().__init__(entity_label) - self.key = "anago_bilstmcrf" - # populated by fit() and load(), expected by save() and transform() - self.preprocessor = None - self.model = None - self.trainer = None - self.tagger = None - - - def fit(self, X, y, + def __init__(self, word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, @@ -44,12 +29,12 @@ def fit(self, X, y, use_crf=True, batch_size=16, learning_rate=0.001, - num_epochs=10): - """ Trains the NER model. Input is list of AnnotatedDocuments. + max_iter=10): + """ Construct a BiLSTM-CRF NER model. Model is augmented with character + level embeddings as well as word embeddings by default. Implementation + is provided by the Anago project. Args: - X list(list(str)): list of list of tokens - y list(list(str)): list of list of BIO tags word_embedding_dim (int): word embedding dimensions. char_embedding_dim (int): character embedding dimensions. word_lstm_size (int): character LSTM feature extractor output dimensions. @@ -59,40 +44,67 @@ def fit(self, X, y, embeddings (numpy array): word embedding matrix. use_char (boolean): add char feature. use_crf (boolean): use crf as last layer. - batch_size training batch size. - learning_rate learning rate for Adam optimizer. - num_epochs number of epochs of training. + batch_size (int): training batch size. + learning_rate (float): learning rate for Adam optimizer. + num_epochs (int): number of epochs of training. 
+            learning_rate (float): learning rate for Adam optimizer.
+            max_iter (int): number of epochs of training.
+        """
+        super().__init__()
+        self.word_embedding_dim = word_embedding_dim
+        self.char_embedding_dim = char_embedding_dim
+        self.word_lstm_size = word_lstm_size
+        self.char_lstm_size = char_lstm_size
+        self.fc_dim = fc_dim
+        self.dropout = dropout
+        self.embeddings = embeddings
+        self.use_char = use_char
+        self.use_crf = use_crf
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_iter = max_iter
+        # populated by fit() and load(), expected by save() and transform()
+        self.preprocessor_ = None
+        self.model_ = None
+        self.trainer_ = None
+        self.tagger_ = None
+
+
+    def fit(self, X, y):
+        """ Trains the NER model. Input is list of AnnotatedDocuments.
+
+            Args:
+                X list(list(str)): list of list of tokens
+                y list(list(str)): list of list of BIO tags
         """
         log.info("Preprocessing dataset...")
-        self.preprocessor = IndexTransformer(use_char=use_char)
-        self.preprocessor.fit(X, y)
+        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
+        self.preprocessor_.fit(X, y)
 
         log.info("Building model...")
-        self.model = BiLSTMCRF(
-            char_embedding_dim=char_embedding_dim,
-            word_embedding_dim=word_embedding_dim,
-            char_lstm_size=char_lstm_size,
-            word_lstm_size=word_lstm_size,
-            char_vocab_size=self.preprocessor.char_vocab_size,
-            word_vocab_size=self.preprocessor.word_vocab_size,
-            num_labels=self.preprocessor.label_size,
-            dropout=dropout,
-            use_char=use_char,
-            use_crf=use_crf)
-        self.model, loss = self.model.build()
-        optimizer = Adam(lr=learning_rate)
-        self.model.compile(loss=loss, optimizer=optimizer)
-        self.model.summary()
+        self.model_ = BiLSTMCRF(
+            char_embedding_dim=self.char_embedding_dim,
+            word_embedding_dim=self.word_embedding_dim,
+            char_lstm_size=self.char_lstm_size,
+            word_lstm_size=self.word_lstm_size,
+            char_vocab_size=self.preprocessor_.char_vocab_size,
+            word_vocab_size=self.preprocessor_.word_vocab_size,
+            num_labels=self.preprocessor_.label_size,
+            dropout=self.dropout,
+            use_char=self.use_char,
+            use_crf=self.use_crf)
+        self.model_, loss = self.model_.build()
+        optimizer = Adam(lr=self.learning_rate)
+        self.model_.compile(loss=loss, optimizer=optimizer)
+        self.model_.summary()
 
         log.info('Training the model...')
-        self.trainer = Trainer(self.model, preprocessor=self.preprocessor)
+        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
         x_train, x_valid, y_train, y_valid = train_test_split(X, y,
             test_size=0.1, random_state=42)
-        self.trainer.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
-            batch_size=batch_size, epochs=num_epochs)
+        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
+            batch_size=self.batch_size, epochs=self.max_iter)
 
-        self.tagger = Tagger(self.model, preprocessor=self.preprocessor)
+        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
 
         return self
 
@@ -105,11 +117,11 @@ def predict(self, X):
         Returns:
             y list(list(str)): list of list of predicted BIO tags.
         """
-        if self.tagger is None:
+        if self.tagger_ is None:
             raise ValueError("No tagger found, either run fit() to train or load() a trained model")
 
         log.info("Predicting from model...")
-        ypreds = [self.tagger.predict(" ".join(x)) for x in X]
+        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
         return ypreds
 
 
@@ -121,7 +133,7 @@ def save(self, dirpath):
 
         Args:
             dirpath (str): a directory where model artifacts will be saved.
                 Model saves a weights.h5 weights file, a params.json parameter
                 file, and a preprocessor.pkl preprocessor file.
""" - if self.model is None or self.preprocessor is None: + if self.model_ is None or self.preprocessor_ is None: raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") if not os.path.exists(dirpath): @@ -131,8 +143,8 @@ def save(self, dirpath): params_file = os.path.join(dirpath, "params.json") preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") - save_model(self.model, weights_file, params_file) - self.preprocessor.save(preprocessor_file) + save_model(self.model_, weights_file, params_file) + self.preprocessor_.save(preprocessor_file) def load(self, dirpath): @@ -153,9 +165,9 @@ def load(self, dirpath): os.path.exists(preprocessor_file)): raise ValueError("Model files may be corrupted, exiting") - self.model = load_model(weights_file, params_file) - self.preprocessor = IndexTransformer.load(preprocessor_file) - self.tagger = Tagger(self.model, preprocessor=self.preprocessor) + self.model_ = load_model(weights_file, params_file) + self.preprocessor_ = IndexTransformer.load(preprocessor_file) + self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_) return self diff --git a/nerds/models/crf.py b/nerds/models/crf.py index 2550345..69bf656 100644 --- a/nerds/models/crf.py +++ b/nerds/models/crf.py @@ -11,23 +11,36 @@ class CrfNER(NERModel): - def __init__(self, entity_label=None): - """ Build a sklearn.crfsuite.CRF CRF model + def __init__(self, + max_iter=100, + c1=0.1, + c2=0.1, + featurizer=None): + """ Construct a Conditional Random Fields (CRF) based NER. Implementation + of CRF NER is provided by sklearn.crfsuite.CRF. Args: - entity_label (str): label for single entity NER, default None + max_iter (int, default 100): maximum number of iterations to run + CRF training + c1 (float, default 0.1): L1 regularization coefficient. + c2 (float, default 0.1): L2 regularization coefficient. + featurizer (function, default None): if None, the default featurizer + _sent2features() is used to convert list of tokens for each + sentence to a list of features, where each feature is a dictionary + of name-value pairs. For custom features, a featurizer function must + be provided that takes in a list of tokens (sentence) and returns a + list of features. """ - super().__init__(entity_label) - self.key = "crfsuite_crf" - self.nlp = None - self.model = None + super().__init__() + self.max_iter = max_iter + self.c1 = c1 + self.c2 = c2 + self.featurizer = featurizer + self._nlp = None + self.model_ = None - def fit(self, X, y, - is_featurized=False, - max_iterations=100, - c1=0.1, - c2=0.1): + def fit(self, X, y): """ Build feature vectors and train CRF model. Wrapper for sklearn_crfsuite.CRF model. The underlying model takes many parameters (for full list (and possible future enhancement), see @@ -39,35 +52,28 @@ def fit(self, X, y, of words, and features are a list of word features, each word feature is a dictionary of name-value pairs. y (list(list(str))): list of list of BIO tags. - is_featurized (bool, default False): if True, X is a list of list - of features, else X is a list of list of words. - max_iterations (int, default 100): maximum number of - iterations to run CRF training - c1 (float, default 0.1): L1 regularization coefficient. - c2 (float, default 0.1): L2 regularization coefficient. 
""" - if not is_featurized: - log.info("Generating features for {:d} samples...".format(len(X))) - if self.nlp is None: - self.nlp = self._load_language_model() - features = [self._sent2features(sent, self.nlp) for sent in X] - + if self.featurizer is None: + features = [self._sent2features(sent) for sent in X] + else: + features = [self.featurizer(sent) for sent in X] + log.info("Building model...") - self.model = sklearn_crfsuite.CRF( + self.model_ = sklearn_crfsuite.CRF( algorithm="lbfgs", - c1=c1, - c2=c2, - max_iterations=max_iterations, + c1=self.c1, + c2=self.c2, + max_iterations=self.max_iter, all_possible_transitions=True, verbose=True) log.info("Training model...") - self.model.fit(X if is_featurized else features, y) + self.model_.fit(features, y) return self - def predict(self, X, is_featurized=False): + def predict(self, X): """ Predicts using trained CRF model. Args: @@ -78,16 +84,15 @@ def predict(self, X, is_featurized=False): Returns: y (list(list(str))): list of list of predicted BIO tags. """ - if self.model is None: + if self.model_ is None: raise ValueError("CRF model not found, run fit() to train or load() pre-trained model") - if not is_featurized: - log.info("Generating features for {:d} samples".format(len(X))) - if self.nlp is None: - self.nlp = self._load_language_model() - features = [self._sent2features(sent, self.nlp) for sent in X] + if self.featurizer is None: + features = [self._sent2features(sent) for sent in X] + else: + features = [self.featurizer(sent) for sent in X] - return self.model.predict(X if is_featurized else features) + return self.model_.predict(features) def save(self, dirpath): @@ -96,14 +101,14 @@ def save(self, dirpath): Args: dirpath (str): path to model directory. """ - if self.model is None: + if self.model_ is None: raise ValueError("No model to save, run fit() to train or load() pre-trained model") if not os.path.exists(dirpath): os.makedirs(dirpath) model_file = os.path.join(dirpath, "crf-model.pkl") - joblib.dump(self.model, model_file) + joblib.dump(self.model_, model_file) def load(self, dirpath): @@ -118,7 +123,7 @@ def load(self, dirpath): if not os.path.exists(model_file): raise ValueError("No CRF model to load at {:s}, exiting.".format(model_file)) - self.model = joblib.load(model_file) + self.model_ = joblib.load(model_file) return self @@ -126,11 +131,20 @@ def _load_language_model(self): return spacy.load("en") - def _sent2features(self, sent, nlp): + def _sent2features(self, sent): """ Converts a list of tokens to a list of features for CRF. Each feature is a dictionary of feature name value pairs. + + Args: + sent (list(str)): a list of tokens representing a sentence. + + Returns: + feats (list(dict(str, obj))): a list of features, where each + feature is a dictionary of name-value pairs. """ - doc = nlp(" ".join(sent)) + if self._nlp is None: + self._nlp = self._load_language_model() + doc = self._nlp(" ".join(sent)) postags = [token.pos_ for token in doc] features = [self._word2featdict(sent, postags, i) for i in range(len(sent))] return features diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py index c4aba24..187d079 100644 --- a/nerds/models/dictionary.py +++ b/nerds/models/dictionary.py @@ -10,15 +10,25 @@ class DictionaryNER(NERModel): - def __init__(self, entity_label=None): - super().__init__(entity_label) - self.key = "aho-corasick-dict-ner" - self.model = None - self.spacy_lm = spacy.load("en") + def __init__(self, from_dictionary=False): + """ Construct a DictionaryNER object. 
The DictionaryNER functions + like a gazetteer, and is based on the Aho-Corasick algorithm + implemented by the pyAhoCorasick package. + + Args: + from_dictionary (bool, default False): if True, input is + multi-word phrases representing entities, otherwise + input is potentially multi-word phrases annotated as + a sequence of (token, tag) pairs. See fit(X, y) for + more information. + """ + super().__init__() + self.from_dictionary = from_dictionary + self._spacy_lm = None + self.model_ = None - def fit(self, X, y, - combine_tokens=True): + def fit(self, X, y): """ Build dictionary of phrases of different entity types. Args: @@ -42,36 +52,29 @@ def fit(self, X, y, is True, then labels are IOB tags. If combine_tokens is False, labels are entity types (without leading B and I), and without any O labels. - - combine_tokens (bool, default True): if True, input comes from - standard training set, and an additional step to chunk - phrases is needed. If False, input comes from a dictionary - with phrase chunking already done. """ - self.model = ahocorasick.Automaton() - - if combine_tokens: + self.model_ = ahocorasick.Automaton() + if self.from_dictionary: + for token, label in zip(X, y): + self.model_.add_word(token, (label, token)) + else: for idx, (tokens, labels) in enumerate(zip(X, y)): phrase_tokens, phrase_labels = self._combine_tokens(tokens, labels) for phrase, label in zip(phrase_tokens, phrase_labels): - self.model.add_word(phrase, (label, phrase)) - else: - for token, label in zip(X, y): - self.model.add_word(token, (label, token)) - self.model.make_automaton() - + self.model_.add_word(phrase, (label, phrase)) + self.model_.make_automaton() return self def predict(self, X): - if self.model is None: + if self.model_ is None: raise ValueError("No model found, use fit() to train or load() pretrained.") predictions = [] for tokens in X: sent = " ".join(tokens) matched_phrases = [] - for end_index, (tag, phrase) in self.model.iter(sent): + for end_index, (tag, phrase) in self.model_.iter(sent): start_index = end_index - len(phrase) + 1 # filter out spurious matches on partial words self._add_if_not_spurious_match( @@ -79,14 +82,16 @@ def predict(self, X): # remove subsumed phrases longest_phrases = self._remove_subsumed_matches(matched_phrases, 1) # convert longest matches to IOB format - _, pred = spans_to_tokens(sent, longest_phrases, self.spacy_lm) + if self._spacy_lm is None: + self._spacy_lm = self._load_language_model() + _, pred = spans_to_tokens(sent, longest_phrases, self._spacy_lm) predictions.append(pred) return predictions def save(self, dirpath=None): - if self.model is None: + if self.model_ is None: raise ValueError("No model found, use fit() to train or load() pretrained.") if not os.path.exists(dirpath): @@ -94,7 +99,7 @@ def save(self, dirpath=None): log.info("Saving model...") model_file = os.path.join(dirpath, "dictionary-ner.pkl") - joblib.dump(self.model, model_file) + joblib.dump(self.model_, model_file) def load(self, dirpath=None): @@ -102,10 +107,14 @@ def load(self, dirpath=None): if not os.path.exists(model_file): raise ValueError("Saved model {:s} not found.".format(model_file)) - self.model = joblib.load(model_file) + self.model_ = joblib.load(model_file) return self + def _load_language_model(self): + return spacy.load("en") + + def _combine_tokens(self, tokens, labels): """ Combine consecutive word tokens for some given entity type to create phrase tokens. 
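Before moving on to the ensemble changes, here is a small usage sketch of the two input modes that the `from_dictionary` flag selects. This is an editorial illustration rather than part of the commit; the file path and phrases are the ones used by the test suite in this series:

```python
from nerds.models import DictionaryNER
from nerds.utils import load_data_and_labels

# default mode: learn phrases and their entity types from IOB-tagged data
X, y = load_data_and_labels("nerds/test/data/example.iob")
model = DictionaryNER()
model.fit(X, y)

# dictionary (gazetteer) mode: supply phrase/label pairs directly
gazetteer = DictionaryNER(from_dictionary=True)
gazetteer.fit(["Pierre Vinken", "Elsevier N . V ."], ["PER", "ORG"])

# either model predicts BIO tags for tokenized sentences
y_pred = gazetteer.predict(X)
```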
diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index 0ae6b8f..9337b12 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -9,23 +9,13 @@ class EnsembleNER(NERModel): - def __init__(self, entity_label=None): - super().__init__(entity_label) - self.key = "voting_ensemble" - # these are set by fit and load, required by predict and save - self.estimators = None - self.weights = None - - - def fit(self, X, y, + def __init__(self, estimators=[], weights=None, is_pretrained=False): - """ Train ensemble by training underlying NERModels. + """ Constructor for Voting Ensemble NER. Args: - X (list(list(str))): list of list of tokens. - y (list(list(str))): list of list of BIO tags. estimators (list(NERModel, dict(str,obj)), default empty): list of (NERModels, fit_param) pairs to use in the ensemble. The fit_param is a flat dictionary of named arguments used in @@ -34,31 +24,47 @@ def fit(self, X, y, apply to predicted class labels from each estimator. If None, then predictions from all estimators are treated equally. + """ - if estimators is None or len(estimators) == 0: + super().__init__() + # these are set by fit and load, required by predict and save + self.estimators = estimators + self.weights = weights + self.is_pretrained=is_pretrained + + + def fit(self, X, y): + """ Train ensemble by training underlying NERModels. + + Args: + X (list(list(str))): list of list of tokens. + y (list(list(str))): list of list of BIO tags. + """ + if self.estimators is None or len(self.estimators) == 0: raise ValueError("Non-empty list of estimators required to fit ensemble.") + if self.weights is None: + self.weights = [1] * len(self.estimators) else: - self.estimators = estimators - if weights is None: - self.weights = [1] * len(estimators) - else: - if len(estimators) != len(weights): + if len(self.estimators) != len(self.weights): raise ValueError("Number of weights must correspond to number of estimators.") - else: - self.weights = weights - if is_pretrained: + if self.is_pretrained: return self # various pickling errors are seen if we use joblib.Parallel to fit # in parallel across multiple processors. Since normal usage should # not involve calling fit(), this is okay to keep as sequential. - fitted_estimators = [self._fit_estimator(clf, X, y, fit_params) - for clf, fit_params in self.estimators] - - self.estimators = [(fitted, params) for (clf, params), fitted in + fitted_estimators = [self._fit_estimator(clf, X, y) + for name, clf in self.estimators] + self.estimators = [(name, fitted) for (name, clf), fitted in zip(self.estimators, fitted_estimators)] + # fitted_estimators = joblib.Parallel(n_jobs=-1, backend="threading")( + # map(lambda clf: joblib.delayed(self._fit_estimator(clf[1], X, y)), + # self.estimators)) + # self.estimators = [(name, fitted) for (name, clf), fitted + # in zip(self.estimators, fitted_estimators)] + return self @@ -78,7 +84,7 @@ def predict(self, X): raise ValueError("Model not ready to predict. 
Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True") predictions = [] - for clf, _ in self.estimators: + for _, clf in self.estimators: predictions.append(clf.predict(X)) return self._vote(predictions) @@ -92,8 +98,8 @@ def save(model_dirpath): raise NotImplementedError() - def _fit_estimator(self, estimator, X, y, fit_params): - fitted_estimator = estimator.fit(X, y, **fit_params) + def _fit_estimator(self, estimator, X, y): + fitted_estimator = estimator.fit(X, y) return fitted_estimator diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 907fa6a..5474c43 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -13,43 +13,46 @@ class SpacyNER(NERModel): - def __init__(self, entity_label=None): - """ Build a SpaCy EntityRecognizer NER model. + def __init__(self, + dropout=0.1, + max_iter=20, + batch_size=32): + """ Construct a SpaCy based NER. The SpaCy library provides an EntityRecognizer + class to do Named Entity Recognition. Args: - entity_label (str, default None): entity label for single class NER. + dropout (float): rate of dropout during training between 0 and 1. + max_iter (int): number of epochs of training. + batch_size (int): batch size to use during training + """ - super().__init__(entity_label) - self.key = "spacy_ner" - self.model = None - self.spacy_lm = spacy.load("en") + super().__init__() + self.dropout = dropout + self.max_iter = max_iter + self.batch_size = batch_size + self._spacy_lm = spacy.load("en") + self.model_ = None - def fit(self, X, y, - num_epochs=20, - dropout=0.1, - batch_size=32): + def fit(self, X, y): """ Trains the SpaCy NER model. Args: X (list(list(str))): list of tokenized sentences, or list of list of tokens. y (list(list(str))): list of list of BIO tags. - num_epochs (int): number of epochs of training. - dropout (float): rate of dropout during training between 0 and 1. 
- batch_size (int): batch size to use during training """ log.info("Reformatting data to SpaCy format...") features = [self._convert_to_spacy(tokens, labels) for tokens, labels in zip(X, y)] log.info("Building SpaCy NER model...") - self.model = spacy.blank("en") - if "ner" not in self.model.pipe_names: - ner = self.model.create_pipe("ner") - self.model.add_pipe(ner) + self.model_ = spacy.blank("en") + if "ner" not in self.model_.pipe_names: + ner = self.model_.create_pipe("ner") + self.model_.add_pipe(ner) else: - ner = self.model.get_pipe("ner") + ner = self.model_.get_pipe("ner") unique_labels = set() for _, annotations in features: @@ -58,24 +61,22 @@ def fit(self, X, y, ner.add_label(ent[2]) for label in list(unique_labels): - ner.add_label("B-" + label) - ner.add_label("I-" + label) - ner.add_label("O") + ner.add_label(label) log.info("Training SpaCy NER model...") - optimizer = self.model.begin_training() + optimizer = self.model_.begin_training() - other_pipes = [p for p in self.model.pipe_names if p != "ner"] - with self.model.disable_pipes(*other_pipes): - for it in range(num_epochs): + other_pipes = [p for p in self.model_.pipe_names if p != "ner"] + with self.model_.disable_pipes(*other_pipes): + for it in range(self.max_iter): random.shuffle(features) losses = {} - batches = minibatch(features, size=batch_size) + batches = minibatch(features, size=self.batch_size) for batch in batches: texts, annotations = zip(*batch) - self.model.update(texts, annotations, + self.model_.update(texts, annotations, sgd=optimizer, - drop=dropout, + drop=self.dropout, losses=losses) loss_value = losses["ner"] log.info("Epoch: {:d} loss: {:.5f}".format(it, loss_value)) @@ -93,14 +94,14 @@ def predict(self, X): Returns: y (list(list(str))): list of list of predicted BIO tags. """ - if self.model is None: + if self.model_ is None: raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") log.info("Generating predictions...") preds = [] for sent_tokens in X: sent = " ".join(sent_tokens) - doc = self.model(sent) + doc = self.model_(sent) sent_preds = self._convert_from_spacy(sent, doc.ents) preds.append(sent_preds) @@ -113,13 +114,13 @@ def save(self, dirpath): Args: dirpath (str): path to model directory. """ - if self.model is None: + if self.model_ is None: raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model") log.info("Saving model...") if not os.path.exists(dirpath): os.makedirs(dirpath) - self.model.to_disk(dirpath) + self.model_.to_disk(dirpath) def load(self, dirpath): @@ -134,7 +135,7 @@ def load(self, dirpath): raise ValueError("Model directory {:s} not found".format(dirpath)) log.info("Loading model...") - self.model = spacy.load(dirpath) + self.model_ = spacy.load(dirpath) return self @@ -174,6 +175,7 @@ def _convert_from_spacy(self, sent, entities): sentence. 
""" spans = [(e.start_char, e.end_char, e.label_) for e in entities] - tokens, tags = spans_to_tokens(sent, spans, self.spacy_lm, spans_are_multiword=False) + tokens, tags = spans_to_tokens(sent, spans, self._spacy_lm, + spans_are_multiword=False) return tags diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py index 60c9466..932fba0 100644 --- a/nerds/test/test_bilstm_ner.py +++ b/nerds/test/test_bilstm_ner.py @@ -7,8 +7,8 @@ def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") - model = BiLstmCrfNER() - model.fit(X, y, num_epochs=1) + model = BiLstmCrfNER(max_iter=1) + model.fit(X, y) model.save("nerds/test/data/models") model_r = model.load("nerds/test/data/models") y_pred = model_r.predict(X) diff --git a/nerds/test/test_crf_ner.py b/nerds/test/test_crf_ner.py index 1ae1686..f296d1f 100644 --- a/nerds/test/test_crf_ner.py +++ b/nerds/test/test_crf_ner.py @@ -15,3 +15,18 @@ def test_crf_ner(): assert_equal(y, y_pred, "Label and prediction must be equal") assert_equal(1.0, model.score(X, y)) shutil.rmtree("nerds/test/data/models") + + +def test_crf_ner_with_nondefault_features(): + def my_test_featurizer(sentence): + return [{"word":token} for token in sentence] + + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = CrfNER(featurizer=my_test_featurizer) + model.fit(X, y) + y_pred = model.predict(X) + # our features are not good enough to do good predictions, so just + # check the lengths of labels vs predictions to make sure it worked + assert_equal(len(y), len(y_pred), "Number of label and predictions must be equal.") + assert_equal(len(y[0]), len(y_pred[0]), "Size of label and predictions must match (1).") + assert_equal(len(y[1]), len(y_pred[1]), "Size of label and predictions must match (2).") diff --git a/nerds/test/test_dictionary_ner.py b/nerds/test/test_dictionary_ner.py index a24b238..c4daa7b 100644 --- a/nerds/test/test_dictionary_ner.py +++ b/nerds/test/test_dictionary_ner.py @@ -26,8 +26,8 @@ def test_dictionary_ner_from_dict(): xs.append(x) ys.append(y) fdict.close() - model = DictionaryNER() - model.fit(xs, ys, combine_tokens=False) + model = DictionaryNER(from_dictionary=True) + model.fit(xs, ys) # predict using example X, y = load_data_and_labels("nerds/test/data/example.iob") y_pred = model.predict(X) diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py index 8762406..2ee8bf4 100644 --- a/nerds/test/test_elmo_ner.py +++ b/nerds/test/test_elmo_ner.py @@ -8,10 +8,10 @@ def test_crf_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") - model = ElmoNER() # there are 28 unique words in our "vocabulary" embeddings = np.random.random((28, 100)) - model.fit(X, y, embeddings=embeddings, num_epochs=1) + model = ElmoNER(embeddings=embeddings, max_iter=1) + model.fit(X, y) model.save("nerds/test/data/models") model_r = model.load("nerds/test/data/models") y_pred = model_r.predict(X) diff --git a/nerds/test/test_ensemble_ner.py b/nerds/test/test_ensemble_ner.py index 19eca34..5659b71 100644 --- a/nerds/test/test_ensemble_ner.py +++ b/nerds/test/test_ensemble_ner.py @@ -3,15 +3,17 @@ from nerds.models import DictionaryNER, CrfNER, SpacyNER, EnsembleNER from nerds.utils import load_data_and_labels +from sklearn.ensemble import VotingClassifier + def test_ensemble_ner(): X, y = load_data_and_labels("nerds/test/data/example.iob") estimators = [ - (DictionaryNER(), {}), - (CrfNER(), {"max_iterations": 1}), - (SpacyNER(), {"num_epochs": 1}) + ("dict_ner", DictionaryNER()), + ("crf_ner", 
CrfNER(max_iter=1)),
+        ("spacy_ner", SpacyNER(max_iter=1))
     ]
-    model = EnsembleNER()
-    model.fit(X, y, estimators=estimators)
+    model = EnsembleNER(estimators=estimators)
+    model.fit(X, y)
     y_pred = model.predict(X)
     assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.")
     assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.")

From 1952e4d403943c4d78c5b099cdca5c554d85e38f Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Mon, 16 Dec 2019 12:45:01 -0800
Subject: [PATCH 34/64] moving fit keyword args to constructor parameters per
 recommendation for Scikit-learn estimators

---
 nerds/models/elmo.py         | 134 +++++++++++++++++++----------
 nerds/test/test_spacy_ner.py |   1 -
 2 files changed, 74 insertions(+), 61 deletions(-)

diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py
index 33d6ca6..a86ac08 100644
--- a/nerds/models/elmo.py
+++ b/nerds/models/elmo.py
@@ -18,22 +18,7 @@

 class ElmoNER(NERModel):

-    def __init__(self, entity_label=None):
-        """ Build a Anago Bi-LSTM CRF model.
-
-            Args:
-                entity_label: label for single entity NER, default None
-        """
-        super().__init__(entity_label)
-        self.key = "anago_elmo"
-        # populated by fit() and load(), expected by save() and transform()
-        self.preprocessor = None
-        self.model = None
-        self.trainer = None
-        self.tagger = None
-
-
-    def fit(self, X, y,
+    def __init__(self,
             word_embedding_dim=100,
             char_embedding_dim=25,
             word_lstm_size=100,
             char_lstm_size=25,
             fc_dim=100,
             dropout=0.5,
             embeddings=None,
             embeddings_file="glove.6B.100d.txt",
             batch_size=16,
             learning_rate=0.001,
-            num_epochs=2):
-        """ Trains the NER model. Input is list of AnnotatedDocuments.
+            max_iter=2):
+        """ Construct an ELMo based NER model. Model is similar to the BiLSTM-CRF
+            model except that the word embeddings are contextual, since they are
+            returned by a trained ELMo model. ELMo model requires an additional
+            embedding, which is Glove-100 by default. ELMo model is provided by
+            the (dev) Anago project.
 
             Args:
-                X list(list(str)): list of list of tokens
-                y list(list(str)): list of list of BIO tags
                 word_embedding_dim (int): word embedding dimensions.
                 char_embedding_dim (int): character embedding dimensions.
                 word_lstm_size (int): character LSTM feature extractor output dimensions.
                 char_lstm_size (int): word tagger LSTM output dimensions.
                 fc_dim (int): output fully-connected layer size.
                 dropout (float): dropout rate.
                 embeddings (numpy array): word embedding matrix.
                 embeddings_file (str): path to embedding file.
                 use_char (boolean): add char feature.
                 use_crf (boolean): use crf as last layer.
-                batch_size training batch size.
-                learning_rate learning rate for Adam optimizer.
-                num_epochs number of epochs of training.
+                batch_size (int): training batch size.
+                learning_rate (float): learning rate for Adam optimizer.
+                max_iter (int): number of epochs of training.
+        """
+        super().__init__()
+        self.word_embedding_dim = word_embedding_dim
+        self.char_embedding_dim = char_embedding_dim
+        self.word_lstm_size = word_lstm_size
+        self.char_lstm_size = char_lstm_size
+        self.fc_dim = fc_dim
+        self.dropout = dropout
+        self.embeddings = embeddings
+        self.embeddings_file = embeddings_file
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_iter = max_iter
+        # populated by fit() and load(), expected by save() and transform()
+        self.preprocessor_ = None
+        self.model_ = None
+        self.trainer_ = None
+        self.tagger_ = None
+
+
+    def fit(self, X, y):
+        """ Trains the NER model. Input is list of AnnotatedDocuments.
+
+            Args:
+                X list(list(str)): list of list of tokens
+                y list(list(str)): list of list of BIO tags
         """
-        if embeddings is None and (embeddings_file is None or embeddings_dim is None):
-            raise ValueError("Either embeddings should be provided, or both embeddings_file and embeddings_dim should be provided, exiting.")
+        if self.embeddings is None and self.embeddings_file is None:
+            raise ValueError("Either embeddings or embeddings_file should be provided, exiting.")
 
         log.info("Preprocessing dataset...")
-        self.preprocessor = ELMoTransformer()
-        self.preprocessor.fit(X, y)
-
-        if embeddings is None:
-            embeddings = load_glove(embeddings_file)
-            embeddings_dim != embeddings[list(embeddings.keys())[0]].shape[0]
-            embeddings = filter_embeddings(embeddings,
-                self.preprocessor._word_vocab.vocab,
+        self.preprocessor_ = ELMoTransformer()
+        self.preprocessor_.fit(X, y)
+
+        if self.embeddings is None:
+            self.embeddings = load_glove(self.embeddings_file)
+            embeddings_dim = self.embeddings[list(self.embeddings.keys())[0]].shape[0]
+            self.embeddings = filter_embeddings(self.embeddings,
+                self.preprocessor_._word_vocab.vocab,
                 embeddings_dim)
 
         log.info("Building model...")
-        self.model = ELModel(
-            char_embedding_dim=char_embedding_dim,
-            word_embedding_dim=word_embedding_dim,
-            char_lstm_size=char_lstm_size,
-            word_lstm_size=word_lstm_size,
-            char_vocab_size=self.preprocessor.char_vocab_size,
-            word_vocab_size=self.preprocessor.word_vocab_size,
-            num_labels=self.preprocessor.label_size,
-            embeddings=embeddings,
-            dropout=dropout)
-
-        self.model, loss = self.model.build()
-        optimizer = Adam(lr=learning_rate)
-        self.model.compile(loss=loss, optimizer=optimizer)
-        self.model.summary()
+        self.model_ = ELModel(
+            char_embedding_dim=self.char_embedding_dim,
+            word_embedding_dim=self.word_embedding_dim,
+            char_lstm_size=self.char_lstm_size,
+            word_lstm_size=self.word_lstm_size,
+            char_vocab_size=self.preprocessor_.char_vocab_size,
+            word_vocab_size=self.preprocessor_.word_vocab_size,
+            num_labels=self.preprocessor_.label_size,
+            embeddings=self.embeddings,
+            dropout=self.dropout)
+
+        self.model_, loss = self.model_.build()
+        optimizer = Adam(lr=self.learning_rate)
+        self.model_.compile(loss=loss, optimizer=optimizer)
+        self.model_.summary()
 
         log.info('Training the model...')
-        self.trainer = Trainer(self.model, preprocessor=self.preprocessor)
+        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
         x_train, x_valid, y_train, y_valid = train_test_split(X, y,
             test_size=0.1, random_state=42)
-        self.trainer.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
-            batch_size=batch_size, epochs=num_epochs)
+        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
+            batch_size=self.batch_size, epochs=self.max_iter)
 
-        self.tagger = Tagger(self.model, preprocessor=self.preprocessor)
+        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
 
         return self
 
@@ -116,11 +130,11 @@ def predict(self, X):
         Returns:
             y list(list(str)): list of list of predicted BIO tags.
         """
-        if self.tagger is None:
+        if self.tagger_ is None:
             raise ValueError("No tagger found, either run fit() to train or load() a trained model")
 
         log.info("Predicting from model...")
-        ypreds = [self.tagger.predict(" ".join(x)) for x in X]
+        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
         return ypreds
 
 
@@ -132,7 +146,7 @@ def save(self, dirpath):
 
         Args:
             dirpath (str): a directory where model artifacts will be saved.
                 Model saves a weights.h5 weights file, a params.json parameter
                 file, and a preprocessor.pkl preprocessor file.
""" - if self.model is None or self.preprocessor is None: + if self.model_ is None or self.preprocessor_ is None: raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") if not os.path.exists(dirpath): @@ -142,8 +156,8 @@ def save(self, dirpath): params_file = os.path.join(dirpath, "params.json") preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") - save_model(self.model, weights_file, params_file) - self.preprocessor.save(preprocessor_file) + save_model(self.model_, weights_file, params_file) + self.preprocessor_.save(preprocessor_file) def load(self, dirpath): @@ -164,9 +178,9 @@ def load(self, dirpath): os.path.exists(preprocessor_file)): raise ValueError("Model files may be corrupted, exiting") - self.model = load_model(weights_file, params_file) - self.preprocessor = ELMoTransformer.load(preprocessor_file) - self.tagger = Tagger(self.model, preprocessor=self.preprocessor) + self.model_ = load_model(weights_file, params_file) + self.preprocessor_ = ELMoTransformer.load(preprocessor_file) + self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_) return self diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py index db1bc37..0d800ce 100644 --- a/nerds/test/test_spacy_ner.py +++ b/nerds/test/test_spacy_ner.py @@ -15,4 +15,3 @@ def test_spacy_ner(): assert_equal(y, y_pred, "Label and prediction must be equal") assert_equal(1.0, model.score(X, y)) shutil.rmtree("nerds/test/data/models") - # assert_true(False) \ No newline at end of file From d3cf61659897c68c40ad1a487e7b36e9ccfb1b77 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 16:50:33 -0800 Subject: [PATCH 35/64] replace vocab building using LabelEncoder --- nerds/models/ensemble.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index 9337b12..c02952d 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -1,6 +1,8 @@ from nerds.models import NERModel from nerds.utils import get_logger +from sklearn.preprocessing import LabelEncoder + import joblib import numpy as np import os @@ -152,13 +154,13 @@ def _vote(self, predictions): def _build_label_vocab(self, predictions): """ build lookup table from token to int and back (for performance) """ - tag2int, int2tag = {}, {} - tok_int = 1 + tags, tag2int, int2tag = [], {}, {} + label_encoder = LabelEncoder() for est_pred in predictions: for sent_pred in est_pred: for tok_pred in sent_pred: - if tok_pred not in tag2int.keys(): - tag2int[tok_pred] = tok_int - tok_int += 1 - int2tag = {v:k for k, v in tag2int.items()} + tags.append(tok_pred) + label_encoder.fit(tags) + tag2int = {t:i for i, t in enumerate(label_encoder.classes_)} + int2tag = {i:t for t, i in tag2int.items()} return tag2int, int2tag From 26d62a33a698584c182907f7b3a86b0b1c2c4c48 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 17:09:11 -0800 Subject: [PATCH 36/64] Ensemble NER can run fit() and predict() in parallel --- docs/CHANGES.md | 6 +++--- nerds/models/ensemble.py | 31 ++++++++++++++----------------- nerds/test/test_ensemble_ner.py | 13 +++++++++++++ 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/CHANGES.md b/docs/CHANGES.md index 1cd3751..829b47c 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -30,7 +30,7 @@ * EnsembleNER * simpler interface * weights from each classifier - * joblib.Parallel -- improve? + * fit() and predict() can use multiple parallel jobs (`n_jobs`). 
* Utils * Thin wrapper over anago's `load_data_and_labels` * `flatten_list` and `unflatten_list` to convert between `list(list(str))` produced by NERDS models and `list(str)` required by `sklearn`, scikit-learn metrics can be used. @@ -38,8 +38,8 @@ * Converters * Converter from BRAT (.txt and .ann) to IOB format * Miscellaneous - * remove deprecated sklearn.external.joblib -> joblib - * True Scikit-Learn interoperability -- some progress has been made, parameters are now provided in constructor rather than as keywords in the `fit()` call. However, `check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are list(list(str)) rather than list(str). + * replaced deprecated sklearn.external.joblib -> joblib + * True Scikit-Learn interoperability -- moving parameters to constructor has resulted in estimators being serializable and usable via joblib.Parallel. However, `sklearn.utils.check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are `list(list(str))` rather than `list(str)`. ## Planned diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index c02952d..ac7483f 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -14,6 +14,7 @@ class EnsembleNER(NERModel): def __init__(self, estimators=[], weights=None, + n_jobs=1, is_pretrained=False): """ Constructor for Voting Ensemble NER. @@ -26,7 +27,11 @@ def __init__(self, apply to predicted class labels from each estimator. If None, then predictions from all estimators are treated equally. - + n_jobs (int, default=1): number of jobs to run in parallel, + default is to single-thread. -1 means to use all available + resources. + is_pretrained (bool, default False): if True, estimators are + assumed to be pretrained and fit() is skipped. """ super().__init__() # these are set by fit and load, required by predict and save @@ -53,19 +58,11 @@ def fit(self, X, y): if self.is_pretrained: return self - # various pickling errors are seen if we use joblib.Parallel to fit - # in parallel across multiple processors. Since normal usage should - # not involve calling fit(), this is okay to keep as sequential. - fitted_estimators = [self._fit_estimator(clf, X, y) - for name, clf in self.estimators] - self.estimators = [(name, fitted) for (name, clf), fitted in - zip(self.estimators, fitted_estimators)] - - # fitted_estimators = joblib.Parallel(n_jobs=-1, backend="threading")( - # map(lambda clf: joblib.delayed(self._fit_estimator(clf[1], X, y)), - # self.estimators)) - # self.estimators = [(name, fitted) for (name, clf), fitted - # in zip(self.estimators, fitted_estimators)] + fitted_estimators = joblib.Parallel(n_jobs=-1, backend="threading")( + joblib.delayed(self._fit_estimator)(clf, X, y) + for name, clf in self.estimators) + self.estimators = [(name, fitted) for (name, clf), fitted + in zip(self.estimators, fitted_estimators)] return self @@ -85,9 +82,9 @@ def predict(self, X): if self.estimators is None or self.weights is None: raise ValueError("Model not ready to predict. 
Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True") - predictions = [] - for _, clf in self.estimators: - predictions.append(clf.predict(X)) + predictions = joblib.Parallel(n_jobs=-1, backend="threading")( + joblib.delayed(self._predict_estimator)(clf, X) + for name, clf in self.estimators) return self._vote(predictions) diff --git a/nerds/test/test_ensemble_ner.py b/nerds/test/test_ensemble_ner.py index 5659b71..05ba593 100644 --- a/nerds/test/test_ensemble_ner.py +++ b/nerds/test/test_ensemble_ner.py @@ -18,3 +18,16 @@ def test_ensemble_ner(): assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.") assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.") + +def test_ensemble_ner_multithreaded(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + estimators = [ + ("dict_ner", DictionaryNER()), + ("crf_ner", CrfNER(max_iter=1)), + ("spacy_ner", SpacyNER(max_iter=1)) + ] + model = EnsembleNER(estimators=estimators, n_jobs=-1) + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.") + assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.") From 24b3b56cd628dabca686b4db708efa4ca7b20c8c Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 17:18:20 -0800 Subject: [PATCH 37/64] update to n_jobs usage in Ensemble NER --- docs/CHANGES.md | 2 +- nerds/models/ensemble.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/CHANGES.md b/docs/CHANGES.md index 829b47c..02eadcb 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -39,7 +39,7 @@ * Converter from BRAT (.txt and .ann) to IOB format * Miscellaneous * replaced deprecated sklearn.external.joblib -> joblib - * True Scikit-Learn interoperability -- moving parameters to constructor has resulted in estimators being serializable and usable via joblib.Parallel. However, `sklearn.utils.check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are `list(list(str))` rather than `list(str)`. + * True Scikit-Learn interoperability -- moved parameters to constructor. However, `sklearn.utils.check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are `list(list(str))` rather than `list(str)`. ## Planned diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index ac7483f..f40addc 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -37,6 +37,7 @@ def __init__(self, # these are set by fit and load, required by predict and save self.estimators = estimators self.weights = weights + self.n_jobs = n_jobs self.is_pretrained=is_pretrained @@ -58,7 +59,7 @@ def fit(self, X, y): if self.is_pretrained: return self - fitted_estimators = joblib.Parallel(n_jobs=-1, backend="threading")( + fitted_estimators = joblib.Parallel(n_jobs=self.n_jobs, backend="threading")( joblib.delayed(self._fit_estimator)(clf, X, y) for name, clf in self.estimators) self.estimators = [(name, fitted) for (name, clf), fitted @@ -82,7 +83,7 @@ def predict(self, X): if self.estimators is None or self.weights is None: raise ValueError("Model not ready to predict. 
Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True") - predictions = joblib.Parallel(n_jobs=-1, backend="threading")( + predictions = joblib.Parallel(n_jobs=self.n_jobs, backend="threading")( joblib.delayed(self._predict_estimator)(clf, X) for name, clf in self.estimators) From b6e9e2ad5f3ab790d8b4f9fab96df6d4a311a7da Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 18:22:26 -0800 Subject: [PATCH 38/64] documentation changed to numpy docstring format. --- nerds/models/base.py | 13 ++++ nerds/models/bilstm.py | 93 +++++++++++++++++------- nerds/models/crf.py | 104 ++++++++++++++++----------- nerds/models/dictionary.py | 140 ++++++++++++++++++++++++------------- nerds/models/elmo.py | 96 +++++++++++++++++-------- nerds/models/ensemble.py | 80 +++++++++++++-------- nerds/models/spacy.py | 99 +++++++++++++++++--------- 7 files changed, 423 insertions(+), 202 deletions(-) diff --git a/nerds/models/base.py b/nerds/models/base.py index 798b5e3..f65c12f 100644 --- a/nerds/models/base.py +++ b/nerds/models/base.py @@ -35,6 +35,19 @@ def load(self, file_path): def score(self, X, y, sample_weights=None): """ Returns score for the model based on predicting on (X, y). This method is needed for GridSearch like operations. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + y : list(list(str)) + list of list of tags + sample_weights : list(float), not used + + Returns + ------- + score: float + numeric score for estimator. """ y_pred = self.predict(X) return accuracy_score(flatten_list(y), flatten_list(y_pred)) diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py index 8abcbc8..72219af 100644 --- a/nerds/models/bilstm.py +++ b/nerds/models/bilstm.py @@ -14,7 +14,6 @@ log = get_logger() - class BiLstmCrfNER(NERModel): def __init__(self, @@ -34,19 +33,39 @@ def __init__(self, level embeddings as well as word embeddings by default. Implementation is provided by the Anago project. - Args: - word_embedding_dim (int): word embedding dimensions. - char_embedding_dim (int): character embedding dimensions. - word_lstm_size (int): character LSTM feature extractor output dimensions. - char_lstm_size (int): word tagger LSTM output dimensions. - fc_dim (int): output fully-connected layer size. - dropout (float): dropout rate. - embeddings (numpy array): word embedding matrix. - use_char (boolean): add char feature. - use_crf (boolean): use crf as last layer. - batch_size (int): training batch size. - learning_rate (float): learning rate for Adam optimizer. - num_epochs (int): number of epochs of training. + Parameters + ---------- + word_embedding_dim : int, optional, default 100 + word embedding dimensions. + char_embedding_dim : int, optional, default 25 + character embedding dimensions. + word_lstm_size : int, optional, default 100 + character LSTM feature extractor output dimensions. + char_lstm_size : int, optional, default 25 + word tagger LSTM output dimensions. + fc_dim : int, optional, default 100 + output fully-connected layer size. + dropout : float, optional, default 0.5 + dropout rate. + embeddings : numpy array + word embedding matrix. + use_char : bool, optional, default True + add char feature. + use_crf : bool, optional, default True + use crf as last layer. + batch_size : int, optional, default 16 + training batch size. 
+ learning_rate : float, optional, default 0.001 + learning rate for Adam optimizer + max_iter : int + number of epochs of training + + Attributes + ---------- + preprocessor_ : reference to preprocessor + model_ : reference to generated model + trainer_ : internal reference to Anago Trainer (model) + tagger_ : internal reference to Anago Tagger (predictor) """ super().__init__() self.word_embedding_dim = word_embedding_dim @@ -71,9 +90,16 @@ def __init__(self, def fit(self, X, y): """ Trains the NER model. Input is list of AnnotatedDocuments. - Args: - X list(list(str)): list of list of tokens - y list(list(str)): list of list of BIO tags + Parameters + ---------- + X : list(list(str)) + list of list of tokens + y : list(list(str)) + list of list of BIO tags + + Returns + ------- + self """ log.info("Preprocessing dataset...") self.preprocessor_ = IndexTransformer(use_char=self.use_char) @@ -112,10 +138,15 @@ def fit(self, X, y): def predict(self, X): """ Predicts using the NER model. - Args: - X list(list(str)): list of list of tokens. - Returns: - y list(list(str)): list of list of predicted BIO tags. + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. """ if self.tagger_ is None: raise ValueError("No tagger found, either run fit() to train or load() a trained model") @@ -128,10 +159,16 @@ def predict(self, X): def save(self, dirpath): """ Saves model to local disk, given a dirpath - Args: - dirpath (str): a directory where model artifacts will be saved. + Parameters + ---------- + dirpath : str + a directory where model artifacts will be saved. Model saves a weights.h5 weights file, a params.json parameter file, and a preprocessor.pkl preprocessor file. + + Returns + ------- + None """ if self.model_ is None or self.preprocessor_ is None: raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") @@ -150,8 +187,14 @@ def save(self, dirpath): def load(self, dirpath): """ Loads a trained model from local disk, given the dirpath - Args: - dirpath (str): a directory where model artifacts are saved. + Parameters + ---------- + dirpath : str + a directory where model artifacts are saved. + + Returns + ------- + self """ if not os.path.exists(dirpath): raise ValueError("Model directory not found: {:s}".format(dirpath)) diff --git a/nerds/models/crf.py b/nerds/models/crf.py index 69bf656..0816d0f 100644 --- a/nerds/models/crf.py +++ b/nerds/models/crf.py @@ -19,17 +19,24 @@ def __init__(self, """ Construct a Conditional Random Fields (CRF) based NER. Implementation of CRF NER is provided by sklearn.crfsuite.CRF. - Args: - max_iter (int, default 100): maximum number of iterations to run - CRF training - c1 (float, default 0.1): L1 regularization coefficient. - c2 (float, default 0.1): L2 regularization coefficient. - featurizer (function, default None): if None, the default featurizer - _sent2features() is used to convert list of tokens for each - sentence to a list of features, where each feature is a dictionary - of name-value pairs. For custom features, a featurizer function must - be provided that takes in a list of tokens (sentence) and returns a - list of features. + Parameters + ---------- + max_iter : int, optional, default 100 + maximum number of iterations to run CRF training + c1 : float, optional, default 0.1 + L1 regularization coefficient. + c2 : float, optional, default 0.1 + L2 regularization coefficient. 
+ featurizer : function, default None + if None, the default featurizer _sent2features() is used to convert + list of tokens for each sentence to a list of features, where each + feature is a dictionary of name-value pairs. For custom features, a + featurizer function must be provided that takes in a list of tokens + (sentence) and returns a list of features. + + Attributes + ---------- + model_ : reference to the internal sklearn_crfsuite.CRF model. """ super().__init__() self.max_iter = max_iter @@ -42,16 +49,19 @@ def __init__(self, def fit(self, X, y): """ Build feature vectors and train CRF model. Wrapper for - sklearn_crfsuite.CRF model. The underlying model takes many - parameters (for full list (and possible future enhancement), see - https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/estimator.html#CRF) - - Args: - X (list(list(str))) or (list(list(dict(str, str)))): list of - sentences or features. Sentences are tokenized into list - of words, and features are a list of word features, each - word feature is a dictionary of name-value pairs. - y (list(list(str))): list of list of BIO tags. + sklearn_crfsuite.CRF model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list + of words. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self """ if self.featurizer is None: features = [self._sent2features(sent) for sent in X] @@ -76,13 +86,15 @@ def fit(self, X, y): def predict(self, X): """ Predicts using trained CRF model. - Args: - X (list(list(dict(str, str))) or list(list(str))): list - of sentences or features. - is_featurized (bool, default False): if True, X is a list - of list of features, else X is a list of list of tokens. - Returns: - y (list(list(str))): list of list of predicted BIO tags. + Parameters + ---------- + X : list(list(dict(str, str)) + list of sentences. Sentences are tokenized into list of words. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. """ if self.model_ is None: raise ValueError("CRF model not found, run fit() to train or load() pre-trained model") @@ -98,8 +110,14 @@ def predict(self, X): def save(self, dirpath): """ Save a trained CRF model at dirpath. - Args: - dirpath (str): path to model directory. + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None """ if self.model_ is None: raise ValueError("No model to save, run fit() to train or load() pre-trained model") @@ -114,10 +132,14 @@ def save(self, dirpath): def load(self, dirpath): """ Load a pre-trained CRF model from dirpath. - Args: - dirpath (str): path to model directory. - Returns: - this object populated with pre-trained model. + Parameters + ----------- + dirpath : str + path to model directory. + + Returns + -------- + self """ model_file = os.path.join(dirpath, "crf-model.pkl") if not os.path.exists(model_file): @@ -135,12 +157,16 @@ def _sent2features(self, sent): """ Converts a list of tokens to a list of features for CRF. Each feature is a dictionary of feature name value pairs. - Args: - sent (list(str)): a list of tokens representing a sentence. + Parameters + ---------- + sent : list(str)) + a list of tokens representing a sentence. - Returns: - feats (list(dict(str, obj))): a list of features, where each - feature is a dictionary of name-value pairs. + Returns + ------- + feats : list(dict(str, obj)) + a list of features, where each feature represents a token + as a dictionary of name-value pairs. 
""" if self._nlp is None: self._nlp = self._load_language_model() diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py index 187d079..70ee863 100644 --- a/nerds/models/dictionary.py +++ b/nerds/models/dictionary.py @@ -15,12 +15,17 @@ def __init__(self, from_dictionary=False): like a gazetteer, and is based on the Aho-Corasick algorithm implemented by the pyAhoCorasick package. - Args: - from_dictionary (bool, default False): if True, input is - multi-word phrases representing entities, otherwise - input is potentially multi-word phrases annotated as - a sequence of (token, tag) pairs. See fit(X, y) for - more information. + Parameters + ---------- + from_dictionary : bool, optional, default False + if True, input is multi-word phrases representing entities, + otherwise input is potentially multi-word phrases annotated as + a sequence of (token, tag) pairs. See fit(X, y) for more + information. + + Attributes + ---------- + model_ : reference to internal pyAhoCorasick Automaton. """ super().__init__() self.from_dictionary = from_dictionary @@ -31,27 +36,16 @@ def __init__(self, from_dictionary=False): def fit(self, X, y): """ Build dictionary of phrases of different entity types. - Args: - X (list(list(str))): list of list of tokens or phrases. - combine_tokens (bool, default True): if combine tokens - is True, then input is tokenized as individual words. - This would be the expected format if the input came - directly from a training set. - - X = [..., [..., "New", "York", "City", ...], ...] - y = [..., [..., "B-loc", "I-loc", "I-loc", ...], ...] - - If combine_tokens is False, then phrases have been - pre-chunked. This would be the expected format if the - input came from a third party dictionary. - - X = [..., [..., "New York City", ...], ...] - y = [..., [..., "loc", ...], ...] - - y (list(list(str))): list of list of labels. If combine_tokens - is True, then labels are IOB tags. If combine_tokens is False, - labels are entity types (without leading B and I), and without - any O labels. + Parameters + ---------- + X : list(list(str)) + list of list of tokens or phrases. + y : list(list(str)) + list of list of labels. + + Returns + ------- + self """ self.model_ = ahocorasick.Automaton() if self.from_dictionary: @@ -67,6 +61,18 @@ def fit(self, X, y): def predict(self, X): + """ Finds matches in text from entries in the Automaton object. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + + Returns + ------- + ypred : list(list(str)) + list of list of predicted BIO tags. + """ if self.model_ is None: raise ValueError("No model found, use fit() to train or load() pretrained.") @@ -91,6 +97,17 @@ def predict(self, X): def save(self, dirpath=None): + """ Saves picked automaton object into dirpath. + + Parameters + ---------- + dirpath : str + path to directory where model will be saved + + Returns + ------- + None + """ if self.model_ is None: raise ValueError("No model found, use fit() to train or load() pretrained.") @@ -103,6 +120,17 @@ def save(self, dirpath=None): def load(self, dirpath=None): + """ Loads model from disk from dirpath. + + Parameters + ---------- + dirpath : str + path to directory where model will be retrieved. 
+ + Returns + ------- + self + """ model_file = os.path.join(dirpath, "dictionary-ner.pkl") if not os.path.exists(model_file): raise ValueError("Saved model {:s} not found.".format(model_file)) @@ -119,13 +147,19 @@ def _combine_tokens(self, tokens, labels): """ Combine consecutive word tokens for some given entity type to create phrase tokens. - Args: - tokens (list(str)): a list of tokens representing a sentence. - labels (list(str)): a list of IOB tags for sentence. - - Returns: - phrases (list(str)): list of multi-word phrases. - phrase_labels (list(str)): list of phrase entity types. + Parameters + ---------- + tokens : list(str) + a list of tokens representing a sentence. + labels : list(str) + a list of IOB tags for sentence. + + Returns + ------- + phrases : list(str) + list of multi-word phrases. + phrase_labels : list(str) + list of phrase entity types. """ phrases, phrase_labels = [], [] phrase_tokens = [] @@ -153,13 +187,22 @@ def _add_if_not_spurious_match(self, start_index, end_index, tag, parts of longer words. This function checks to make sure any matches it reports don't do so. - Args: - start_index (int): reported start index of matched phrase. - end_index (int): reported end index of matched phrase. - tag (str): the entity type. - sentence (str): the sentence in which match occurs. - matched_phrases (list(str)): list of matched phrases, updated - in place by function. + Parameters + ---------- + start_index : int + reported start index of matched phrase. + end_index : int + reported end index of matched phrase. + tag : str + the entity type. + sentence : str + the sentence in which match occurs. + matched_phrases : list(str) + list of matched phrases, updated in place by function. + + Returns + ------- + None """ if start_index == 0: if end_index + 1 < len(sentence): @@ -184,13 +227,16 @@ def _remove_subsumed_matches(self, matched_phrases, k): phrase to match against. Function stops when we have seen all the phrases. - Args: - matched_phrases (list((start, end, iob_tag))): list of - matched phrase tuples. - k (int): starting position. + Parameters + ---------- + matched_phrases : list((start, end, iob_tag)) + list of matched phrase tuples. + k : int + starting position. - Returns: - matched_phrases: without the shorter subsumed phrase tuples. + Returns + ------- + matched_phrases without shorter subsumed phrase tuples. """ if k >= len(matched_phrases): return matched_phrases diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py index a86ac08..c69cdf1 100644 --- a/nerds/models/elmo.py +++ b/nerds/models/elmo.py @@ -36,21 +36,37 @@ def __init__(self, embedding, which is Glove-100 by default. ELMo model is provided by the (dev) Anago project. - Args: - entity_label: label for single entity NER, default None - word_embedding_dim (int): word embedding dimensions. - char_embedding_dim (int): character embedding dimensions. - word_lstm_size (int): character LSTM feature extractor output dimensions. - char_lstm_size (int): word tagger LSTM output dimensions. - fc_dim (int): output fully-connected layer size. - dropout (float): dropout rate. - embeddings (numpy array): word embedding matrix. - embeddings_file (str): path to embedding file. - use_char (boolean): add char feature. - use_crf (boolean): use crf as last layer. - batch_size (int): training batch size. - learning_rate (float): learning rate for Adam optimizer. - max_iter (int): number of epochs of training. + Parameters + ---------- + word_embedding_dim : int, optional, default 100 + word embedding dimensions. 
+ char_embedding_dim : int, optional, default 25 + character embedding dimensions. + word_lstm_size: int, optional, default 100 + character LSTM feature extractor output dimensions. + char_lstm_size : int, optional, default 25 + word tagger LSTM output dimensions. + fc_dim : int, optional, default 100 + output fully-connected layer size. + dropout : float, optional, default 0.5 + dropout rate. + embeddings : numpy array + word embedding matrix. + embeddings_file : str + path to embedding file. + batch_size : int, optional, default 16 + training batch size. + learning_rate : float, optional, default 0.001 + learning rate for Adam optimizer. + max_iter : int, optional, default 2 + number of epochs of training. + + Attributes + ---------- + preprocessor_ : reference to Anago preprocessor. + model_ : reference to the internal Anago ELModel + trainer_ : reference to the internal Anago Trainer object. + tagger_ : reference to the internal Anago Tagger object. """ super().__init__() self.word_embedding_dim = word_embedding_dim @@ -74,9 +90,16 @@ def __init__(self, def fit(self, X, y): """ Trains the NER model. Input is list of AnnotatedDocuments. - Args: - X list(list(str)): list of list of tokens - y list(list(str)): list of list of BIO tags + Parameters + ---------- + X : list(list(str)) + list of list of tokens + y : list(list(str)) + list of list of BIO tags + + Returns + ------- + self """ if self.embeddings is None and self.embeddings_file is None: raise ValueError("Either embeddings or embeddings_file should be provided, exiting.") @@ -125,10 +148,15 @@ def fit(self, X, y): def predict(self, X): """ Predicts using the NER model. - Args: - X list(list(str)): list of list of tokens. - Returns: - y list(list(str)): list of list of predicted BIO tags. + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. """ if self.tagger_ is None: raise ValueError("No tagger found, either run fit() to train or load() a trained model") @@ -141,10 +169,16 @@ def predict(self, X): def save(self, dirpath): """ Saves model to local disk, given a dirpath - Args: - dirpath (str): a directory where model artifacts will be saved. - Model saves a weights.h5 weights file, a params.json parameter - file, and a preprocessor.pkl preprocessor file. + Parameters + ----------- + dirpath : str + a directory where model artifacts will be saved. Model saves a + weights.h5 weights file, a params.json parameter file, and a + preprocessor.pkl preprocessor file. + + Returns + ------- + None """ if self.model_ is None or self.preprocessor_ is None: raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") @@ -163,8 +197,14 @@ def save(self, dirpath): def load(self, dirpath): """ Loads a trained model from local disk, given the dirpath - Args: - dirpath (str): a directory where model artifacts are saved. + Parameters + ---------- + dirpath : str + a directory where model artifacts are saved. + + Returns + ------- + self """ if not os.path.exists(dirpath): raise ValueError("Model directory not found: {:s}".format(dirpath)) diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index f40addc..5efdba7 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -18,20 +18,24 @@ def __init__(self, is_pretrained=False): """ Constructor for Voting Ensemble NER. 
- Args: - estimators (list(NERModel, dict(str,obj)), default empty): list - of (NERModels, fit_param) pairs to use in the ensemble. The - fit_param is a flat dictionary of named arguments used in - fit() for the particular NERModel. - weights (list(int), default None): sequence of weights to - apply to predicted class labels from each estimator. If - None, then predictions from all estimators are treated - equally. - n_jobs (int, default=1): number of jobs to run in parallel, - default is to single-thread. -1 means to use all available - resources. - is_pretrained (bool, default False): if True, estimators are - assumed to be pretrained and fit() is skipped. + Parameters + ---------- + estimators : list((str, NERModel)) + list of (name, NERModel) tuples of models in the ensemble. + weights : list(int), optional + sequence of weights to apply to predicted class labels from + each estimator. If None, then predictions from all estimators + are weighted equally. + n_jobs : int, default=1 + number of jobs to run in parallel, default is to single-thread. + -1 means to use all available resources. + is_pretrained : bool, default False + if True, estimators are assumed to be pretrained and fit() + is skipped. + + Attributes + ---------- + None """ super().__init__() # these are set by fit and load, required by predict and save @@ -44,9 +48,16 @@ def __init__(self, def fit(self, X, y): """ Train ensemble by training underlying NERModels. - Args: - X (list(list(str))): list of list of tokens. - y (list(list(str))): list of list of BIO tags. + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self """ if self.estimators is None or len(self.estimators) == 0: raise ValueError("Non-empty list of estimators required to fit ensemble.") @@ -74,11 +85,15 @@ def predict(self, X): predictions using a voting scheme given by the vote() method (subclasses can override voting policy by overriding vote()). - Args: - X (list(list(str))): list of list of tokens to predict from. + Parameters + ---------- + X : list(list(str)) + list of list of tokens to predict from. - Returns: - ypred (list(list(str))): list of list of BIO tags. + Returns + ------- + ypred : list(list(str)) + list of list of BIO tags predicted by model. """ if self.estimators is None or self.weights is None: raise ValueError("Model not ready to predict. Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True") @@ -111,15 +126,20 @@ def _vote(self, predictions): """ Voting mechanism (can be overriden by subclass if desired). - Args: - predictions (list(list(list(str)))): a list of list of list of BIO - tags predicted by each NER in the ensemble. Each NER outputs - a list of list of BIO tags where the outer list corresponds - to sentences and the inner list corresponds to tokens. - - Returns: - voted_predictions (list(list(str))): a list of list of BIO tags. - Each BIO tag represents the most frequent tag + Parameters + ---------- + predictions : list(list(list(str))) + List of list of list of BIO tags predicted by each NER in + the ensemble. Each NER outputs a list of list of BIO tags + where the outer list corresponds to sentences and the inner + list corresponds to tokens. + + Returns + ------- + voted_predictions : list(list(str)) + List of list of BIO tags. Output BIO tag at each position + is the one that is predicted by the majority of NERs in + the ensemble. 
""" tag2int, int2tag = self._build_label_vocab(predictions) diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 5474c43..8540490 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -20,11 +20,18 @@ def __init__(self, """ Construct a SpaCy based NER. The SpaCy library provides an EntityRecognizer class to do Named Entity Recognition. - Args: - dropout (float): rate of dropout during training between 0 and 1. - max_iter (int): number of epochs of training. - batch_size (int): batch size to use during training - + Parameters + ---------- + dropout : float, optional, default 0.1 + rate of dropout during training between 0 and 1. + max_iter : int, optional, default 20 + number of epochs of training. + batch_size : int, optional, default 32 + batch size to use during training + + Attributes + ---------- + model_ : reference to internal SpaCy EntityRecognizer model. """ super().__init__() self.dropout = dropout @@ -37,10 +44,16 @@ class to do Named Entity Recognition. def fit(self, X, y): """ Trains the SpaCy NER model. - Args: - X (list(list(str))): list of tokenized sentences, or list of list - of tokens. - y (list(list(str))): list of list of BIO tags. + Parameters + ---------- + X : list(list(str)) + list of tokenized sentences, or list of list of tokens. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self """ log.info("Reformatting data to SpaCy format...") features = [self._convert_to_spacy(tokens, labels) @@ -87,12 +100,15 @@ def fit(self, X, y): def predict(self, X): """ Predicts using trained SpaCy NER model. - Args: - X (list(list(str))): list of tokenized sentences. - is_featurized (bool, default False): if True, X is a list - of list of features, else X is a list of list of tokens. - Returns: - y (list(list(str))): list of list of predicted BIO tags. + Parameters + ---------- + X : list(list(str)) + list of tokenized sentences. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. """ if self.model_ is None: raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") @@ -111,8 +127,14 @@ def predict(self, X): def save(self, dirpath): """ Save trained SpaCy NER model at dirpath. - Args: - dirpath (str): path to model directory. + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None """ if self.model_ is None: raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model") @@ -126,10 +148,14 @@ def save(self, dirpath): def load(self, dirpath): """ Load a pre-trained SpaCy NER model from dirpath. - Args: - dirpath (str): path to model directory. - Returns: - this object populated with pre-trained model. + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + self """ if not os.path.exists(dirpath): raise ValueError("Model directory {:s} not found".format(dirpath)) @@ -142,11 +168,15 @@ def load(self, dirpath): def _convert_to_spacy(self, tokens, labels): """ Convert data and labels for single sentence to SpaCy specific format: - Args: - tokens (list(str)): list of tokens. - labels (list(str)): list of BIO tags. + Parameters + ---------- + tokens : list(str) + list of tokens. + labels : list(str) + list of BIO tags. 
- Returns: + Returns + -------- list of tuples in SpaCy format as shown below: ( "The quick brown fox jumps over the lazy dog", @@ -165,14 +195,17 @@ def _convert_to_spacy(self, tokens, labels): def _convert_from_spacy(self, sent, entities): """ Converts SpaCy predictions to standard form. - Args: - sent (str): the sentence as a string. - entities (list(entities)): a list of SpaCy Entity objects - Entity(start_char, end_char, label_). - - Returns: - predictions (list(str)): a list of BIO tags for a single - sentence. + Parameters + ---------- + sent : str + the sentence as a string. + entities : list(entities) + a list of SpaCy Entity(start_char, end_char, label_) objects. + + Returns + ------- + predictions : list(str) + a list of BIO tags for a single sentence. """ spans = [(e.start_char, e.end_char, e.label_) for e in entities] tokens, tags = spans_to_tokens(sent, spans, self._spacy_lm, From 9652959a92d0e05ed23acaf8f61248525734d0f7 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 16 Dec 2019 18:39:14 -0800 Subject: [PATCH 39/64] updated list of changes --- docs/CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/CHANGES.md b/docs/CHANGES.md index 02eadcb..b15b456 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -40,10 +40,10 @@ * Miscellaneous * replaced deprecated sklearn.external.joblib -> joblib * True Scikit-Learn interoperability -- moved parameters to constructor. However, `sklearn.utils.check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are `list(list(str))` rather than `list(str)`. + * Docs converted to Numpy Docstring format. ## Planned -* Convert Docs to numpy docstring format -- https://numpydoc.readthedocs.io/en/latest/format.html * BERT Transformer based NER * FLAIR based NER From 008ffb9615bc3742d9b58b1e77d329b4b14c8565 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 08:41:40 -0800 Subject: [PATCH 40/64] documentation fix --- nerds/models/crf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nerds/models/crf.py b/nerds/models/crf.py index 0816d0f..69c7aa9 100644 --- a/nerds/models/crf.py +++ b/nerds/models/crf.py @@ -88,7 +88,7 @@ def predict(self, X): Parameters ---------- - X : list(list(dict(str, str)) + X : list(list(str)) list of sentences. Sentences are tokenized into list of words. Returns From 353980896fc71e39d8747cfe32aa083db2f45354 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 08:42:29 -0800 Subject: [PATCH 41/64] integrated FLAIR SequenceTagger (NER) into NERDS. 
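
The new FlairNER wrapper keeps the same fit() / save() / load() / predict() surface as the other NERDS models, so it drops into existing pipelines unchanged. A minimal usage sketch, mirroring the accompanying nerds/test/test_flair_ner.py (the corpus path and model directory below are illustrative placeholders):

```
from nerds.models import FlairNER
from nerds.utils import load_data_and_labels

# parallel lists of token lists and BIO tag lists
X, y = load_data_and_labels("nerds/test/data/example.iob")

# basedir is where flair's ModelTrainer writes its training artifacts
model = FlairNER("models/flair_model", max_iter=10)
model.fit(X, y)                    # trains a flair SequenceTagger
model.save("models/flair_model")   # writes final-model.pt

trained = FlairNER("models/flair_model").load("models/flair_model")
y_pred = trained.predict(X)        # list(list(str)) of predicted BIO tags
```
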
--- nerds/models/__init__.py | 2 + nerds/models/flair.py | 264 +++++++++++++++++++++++++++++++++++ nerds/test/test_flair_ner.py | 22 +++ 3 files changed, 288 insertions(+) create mode 100644 nerds/models/flair.py create mode 100644 nerds/test/test_flair_ner.py diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py index da69407..0b4ed26 100644 --- a/nerds/models/__init__.py +++ b/nerds/models/__init__.py @@ -4,6 +4,7 @@ from nerds.models.spacy import SpacyNER from nerds.models.dictionary import DictionaryNER from nerds.models.elmo import ElmoNER +from nerds.models.flair import FlairNER from nerds.models.ensemble import EnsembleNER __all__ = [ @@ -13,5 +14,6 @@ "SpacyNER", "BiLstmCrfNER", "ElmoNER", + "FlairNER", "EnsembleNER" ] diff --git a/nerds/models/flair.py b/nerds/models/flair.py new file mode 100644 index 0000000..447155d --- /dev/null +++ b/nerds/models/flair.py @@ -0,0 +1,264 @@ +import flair +import os +import torch + +from flair.data import Corpus, Sentence, Token +from flair.embeddings import (CharacterEmbeddings, TokenEmbeddings, + WordEmbeddings, StackedEmbeddings) +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +from sklearn.model_selection import train_test_split +from torch.optim import SGD, Adam + +from nerds.models import NERModel +from nerds.utils import get_logger + +log = get_logger() + +class FlairNER(NERModel): + + def __init__(self, + basedir, + hidden_dim=256, + embeddings=None, + use_crf=True, + use_rnn=True, + num_rnn_layers=1, + dropout=0.0, + word_dropout=0.05, + locked_dropout=0.5, + optimizer="sgd", + learning_rate=0.1, + batch_size=32, + max_iter=10): + """ Construct a FLAIR NER. + + Parameters + ---------- + basedir : str + directory where all model artifacts will be written. + hidden_dim : int, optional, default 256 + dimension of RNN hidden layer. + embeddings : flair.embeddings.TokenEmbeddings, optional + if not provided, default embedding used is stacked GloVe + WordEmbeddings and CharacterEmbeddings. + use_crf : bool, default True + if True, CRF decoder layer is used in model, otherwise absent. + use_rnn : bool, default True + if True, RNN layer used after Embeddings, otherwise absent. + dropout : float, optional, default 0.0 + dropout probability. + word_dropout : float, optional, default 0.05 + word dropout probability. + locked_dropout : float, optional, default 0.5 + locked dropout probability. + optimizer : str, optional, default "sgd" + valid values are "sgd" and "adam" + learning_rate : float, optional, default 0.1 + learning rate for (SGD) optimizer. + batch_size : int, optional, default 32 + batch size to use during training. + max_iter : int, optional, default 10 + number of epochs to train. + + Attributes + ---------- + model_ : reference to the underlying flair.models.SequenceTagger model. + """ + super().__init__() + self.basedir = basedir + self.hidden_dim = hidden_dim + self.embeddings = embeddings + self.use_crf = use_crf + self.use_rnn = use_rnn + self.num_rnn_layers = num_rnn_layers + self.dropout = dropout + self.word_dropout = word_dropout + self.locked_dropout = locked_dropout + self.optimizer = optimizer + self.learning_rate = learning_rate + self.batch_size = batch_size + self.max_iter = max_iter + self.model_ = None + + + def fit(self, X, y): + """ Build feature vectors and train CRF model. Wrapper for + sklearn_crfsuite.CRF model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list + of words. 
+ y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + log.info("Creating FLAIR corpus...") + Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1) + sents_train = self._convert_to_flair(Xtrain, ytrain) + sents_val = self._convert_to_flair(Xval, yval) + corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus") + + tag_dict = corpus_train.make_tag_dictionary(tag_type="ner") + + if self.embeddings is None: + embedding_types = [ + WordEmbeddings("glove"), + CharacterEmbeddings() + ] + self.embeddings = StackedEmbeddings(embeddings=embedding_types) + + log.info("Building FLAIR NER...") + self.model_ = SequenceTagger(hidden_size=self.hidden_dim, + embeddings=self.embeddings, + tag_dictionary=tag_dict, + tag_type="ner", + use_crf=self.use_crf, + use_rnn=self.use_rnn, + rnn_layers=self.num_rnn_layers, + dropout=self.dropout, + word_dropout=self.word_dropout, + locked_dropout=self.locked_dropout) + + log.info("Training FLAIR NER...") + opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam + trainer = ModelTrainer(self.model_, corpus_train, opt) + trainer.train(base_path=self.basedir, + learning_rate=self.learning_rate, + mini_batch_size=self.batch_size, + max_epochs=self.max_iter) + + return self + + + def predict(self, X): + """ Predicts using trained CRF model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list of words. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None: + raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") + + log.info("Generating predictions...") + sents_test = self._convert_to_flair(X) + sents_pred = self.model_.predict(sents_test, + mini_batch_size=self.batch_size, + all_tag_prob=True) + _, ypred = self._convert_from_flair(sents_pred) + + return ypred + + + def save(self, dirpath): + """ Save trained SpaCy NER model at dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model.") + + if not(os.path.exists(dirpath) and os.path.isdir(dirpath)): + os.makedirs(dirpath) + self.model_.save(os.path.join(dirpath, "final-model.pt")) + + + def load(self, dirpath): + """ Load a pre-trained SpaCy NER model from dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + self + """ + if not(os.path.exists(dirpath) and os.path.isdir(dirpath)): + raise ValueError("Model directory {:s} not found".format(dirpath)) + + if not os.path.exists(os.path.join(dirpath, "final-model.pt")): + raise ValueError("No model file in directory {:s}".format(dirpath)) + + self.model_ = SequenceTagger.load(os.path.join(dirpath, "final-model.pt")) + + return self + + + def _convert_to_flair(self, data, labels=None): + """ Convert data and labels into a list of flair.data.Sentence objects. + + Parameters + ---------- + data : list(list(str)) + list of list of tokens, each inner list represents a list of + tokens or words in sentence, and each outer list represents + a sentence. + labels : list(list(str)), can be None + list of list of NER tags corresponding to tokens in data. 
+ + Returns + ------- + sentences : list(flair.data.Sentence) + """ + sentences = [] + if labels is None: + labels = data + use_dummy_labels = True + else: + use_dummy_labels = False + for tokens, tags in zip(data, labels): + sentence = Sentence() + for token, tag in zip(tokens, tags): + t = Token(token) + if not use_dummy_labels: + t.add_tag("ner", tag) + sentence.add_token(t) + sentences.append(sentence) + return sentences + + + def _convert_from_flair(self, sentences): + """ Convert a list of flair.data.Sentence objects to parallel lists for + data and label lists. + + Parameters + ---------- + sentences : list(flair.data.Sentence) + list of FLAIR Sentence objects populated with tag predictions. + + Returns + ------- + data : list(list(str)) + list of list of tokens. + labels : list(list(str)) + list of list of tags. + """ + data, labels = [], [] + for sentence in sentences: + tokens = [t.text for t in sentence.tokens] + tags = [t.tags["ner"].value for t in sentence.tokens] + data.append(tokens) + labels.append(tags) + return data, labels diff --git a/nerds/test/test_flair_ner.py b/nerds/test/test_flair_ner.py new file mode 100644 index 0000000..d366f76 --- /dev/null +++ b/nerds/test/test_flair_ner.py @@ -0,0 +1,22 @@ +from nose.tools import assert_equal, assert_true + +from nerds.models import FlairNER +from nerds.utils import load_data_and_labels + +import shutil + +import warnings +warnings.filterwarnings("ignore") + +def test_flair_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = FlairNER("nerds/test/data/models", max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + # FLAIR NER needs more data to train than provided, so pointless testing + # for prediction quality, just make sure prediction produces something sane + assert_equal(len(y), len(y_pred), "Size of Label and prediction must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + shutil.rmtree("nerds/test/data/models") From 78ae769afd5502ad46e2f5764eb96802af56f28f Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 08:43:50 -0800 Subject: [PATCH 42/64] added FLAIR to both examples, updated Ensemble calling (constructor injection) style. 
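
The ensemble calling convention changes here from passing estimators and fit parameters into fit() to injecting (name, model) tuples through the constructor, in line with scikit-learn's VotingClassifier style. A condensed sketch of the new style with pre-trained members (the model directories and test IOB path are illustrative):

```
from nerds.models import CrfNER, DictionaryNER, EnsembleNER
from nerds.utils import load_data_and_labels

xtest, ytest = load_data_and_labels("test.iob")

# load pre-trained member models
dict_model = DictionaryNER()
dict_model.load("models/dict_model")
crf_model = CrfNER()
crf_model.load("models/crf_model")

# (name, model) tuples are injected via the constructor, and
# is_pretrained=True makes fit() a no-op for already-trained members
estimators = [("dict_model", dict_model), ("crf_model", crf_model)]
model = EnsembleNER(estimators=estimators, is_pretrained=True)
y_pred = model.predict(xtest)  # majority vote across member predictions
```
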
--- examples/BioNLP/README.md | 27 ++++++++++++++++++++++----- examples/BioNLP/test_models.py | 30 ++++++++++++++++++++---------- examples/GMB/README.md | 30 +++++++++++++++++++++++++----- examples/GMB/test_models.py | 29 +++++++++++++++++++---------- 4 files changed, 86 insertions(+), 30 deletions(-) diff --git a/examples/BioNLP/README.md b/examples/BioNLP/README.md index 10dc191..d8a6109 100644 --- a/examples/BioNLP/README.md +++ b/examples/BioNLP/README.md @@ -22,7 +22,7 @@ Our example will use the `data/train/Genia4ERtask1.iob2` file for training, and ## Results -### Dictionary NER +### Dictionary NER (from_dictionary=False) ``` precision recall f1-score support @@ -38,7 +38,7 @@ Our example will use the `data/train/Genia4ERtask1.iob2` file for training, and weighted avg 0.70 0.61 0.65 19392 ``` -### CRF NER (max_iterations=100, c1=0.1, c2=0.1) +### CRF NER (c1=0.1, c2=0.1, max_iter=100, featurizer=Default) ``` precision recall f1-score support @@ -54,7 +54,7 @@ weighted avg 0.70 0.61 0.65 19392 weighted avg 0.79 0.76 0.77 19392 ``` -### SpaCy NER (num_epochs=20, dropout=0.1) +### SpaCy NER (dropout=0.1, max_iter=20, batch_size=32) ``` precision recall f1-score support @@ -70,7 +70,7 @@ weighted avg 0.79 0.76 0.77 19392 weighted avg 0.79 0.78 0.78 19392 ``` -### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, max_iter=10) ``` precision recall f1-score support @@ -86,7 +86,7 @@ weighted avg 0.79 0.78 0.78 19392 weighted avg 0.79 0.77 0.78 19392 ``` -### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, max_iter=2) ``` precision recall f1-score support @@ -102,6 +102,23 @@ weighted avg 0.79 0.77 0.78 19392 weighted avg 0.79 0.82 0.80 19392 ``` +### FLAIR NER (hidden_dim=256, embeddings=StackedEmbeddings(WordEmbeddings("glove"), CharEmbeddings()), use_crf=True, use_rnn=True, num_rnn_layers=1, dropout=0.0, word_dropout=0.05, locked_dropout=0.5, optimizer="sgd", learning_rate=0.1, batch_size=32, max_iter=10) + +``` + precision recall f1-score support + + cell_line 0.47 0.75 0.58 1489 + cell_type 0.88 0.63 0.74 4912 + protein 0.82 0.77 0.80 9841 + DNA 0.79 0.76 0.77 2845 + RNA 0.76 0.80 0.78 305 + + micro avg 0.78 0.73 0.76 19392 + macro avg 0.74 0.74 0.73 19392 +weighted avg 0.80 0.73 0.76 19392 + +``` + ### Majority voting ensemble (pre-trained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) ``` diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py index e2a69d0..59c26ec 100644 --- a/examples/BioNLP/test_models.py +++ b/examples/BioNLP/test_models.py @@ -6,7 +6,8 @@ from sklearn.utils import shuffle from nerds.models import ( - DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, ElmoNER, EnsembleNER + DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, + ElmoNER, FlairNER, EnsembleNER ) from nerds.utils 
import * @@ -73,6 +74,17 @@ flatten_list(ypred, strip_prefix=True), labels=entity_labels)) +# train and test the FLAIR NER +model = FlairNER("models/flair_model") +model.fit(xtrain, ytrain) +model.save("models/flair_model") +trained_model = model.load("models/flair_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + + # create and test an ensemble dict_model = DictionaryNER() dict_model.load("models/dict_model") @@ -82,15 +94,13 @@ spacy_model.load("models/spacy_model") bilstm_model = BiLstmCrfNER() bilstm_model.load("models/bilstm_model") -model = EnsembleNER() -model.fit(xtrain, ytrain, - estimators=[ - (dict_model, {}), - (crf_model, {}), - (spacy_model, {}), - (bilstm_model, {}) - ], - is_pretrained=True) +estimators = [ + ("dict_model", dict_model), + ("crf_model", crf_model), + ("spacy_model", spacy_model), + ("bilstm_model", bilstm_model) +] +model = EnsembleNER(estimators=estimators, is_pretrained=True) ypred = model.predict(xtest) print(classification_report(flatten_list(ytest, strip_prefix=True), flatten_list(ypred, strip_prefix=True), diff --git a/examples/GMB/README.md b/examples/GMB/README.md index 27c281d..0b6f54d 100644 --- a/examples/GMB/README.md +++ b/examples/GMB/README.md @@ -23,7 +23,7 @@ We train with the full set of data, and the entire run across all the provided m ## Results -### Dictionary NER +### Dictionary NER (from_dictionary=False) ``` precision recall f1-score support @@ -42,7 +42,7 @@ We train with the full set of data, and the entire run across all the provided m weighted avg 0.48 0.77 0.55 48418 ``` -### CRF NER (max_iterations=100, c1=0.1, c2=0.1) +### CRF NER (c1=0.1, c2=0.1, max_iter=100, featurizer=Default) ``` precision recall f1-score support @@ -64,7 +64,7 @@ weighted avg 0.87 0.85 0.86 48418 The entity types which have enough examples have good results! 
-### SpaCy NER (num_epochs=20, dropout=0.1) +### SpaCy NER (dropout=0.1, max_iter=20, batch_size=32) ``` precision recall f1-score support @@ -84,7 +84,7 @@ weighted avg 0.87 0.85 0.86 48418 ``` -### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, num_epochs=10) +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, max_iter=10) ``` precision recall f1-score support @@ -104,7 +104,7 @@ weighted avg 0.86 0.85 0.85 48418 ``` -### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, num_epochs=2) +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, max_iter=2) ``` precision recall f1-score support @@ -124,6 +124,26 @@ weighted avg 0.87 0.87 0.86 48418 ``` +### FLAIR NER (hidden_dim=256, embeddings=StackedEmbeddings(WordEmbeddings("glove"), CharEmbeddings()), use_crf=True, use_rnn=True, num_rnn_layers=1, dropout=0.0, word_dropout=0.05, locked_dropout=0.5, optimizer="sgd", learning_rate=0.1, batch_size=32, max_iter=10) + +``` + precision recall f1-score support + + art 0.00 0.00 0.00 215 + eve 0.71 0.20 0.31 169 + geo 0.84 0.91 0.87 13724 + gpe 0.95 0.92 0.94 4850 + nat 0.50 0.06 0.11 94 + org 0.85 0.67 0.75 10884 + per 0.84 0.92 0.88 10342 + tim 0.90 0.88 0.89 8140 + + micro avg 0.86 0.84 0.85 48418 + macro avg 0.70 0.57 0.59 48418 +weighted avg 0.86 0.84 0.85 48418 + +``` + ### Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) ``` diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py index 1429aea..d76ed69 100644 --- a/examples/GMB/test_models.py +++ b/examples/GMB/test_models.py @@ -7,7 +7,8 @@ from sklearn.utils import shuffle from nerds.models import ( - DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, ElmoNER, EnsembleNER + DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, + ElmoNER, FlairNER, EnsembleNER ) from nerds.utils import * @@ -105,6 +106,16 @@ def convert_to_iob_format(input_file, output_file): flatten_list(ypred, strip_prefix=True), labels=entity_labels)) +# train and test the FLAIR NER +model = FlairNER("models/flair_model") +model.fit(xtrain, ytrain) +model.save("models/flair_model") +trained_model = model.load("models/flair_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + # create and test an ensemble dict_model = DictionaryNER() dict_model.load("models/dict_model") @@ -114,15 +125,13 @@ def convert_to_iob_format(input_file, output_file): spacy_model.load("models/spacy_model") bilstm_model = BiLstmCrfNER() bilstm_model.load("models/bilstm_model") -model = EnsembleNER() -model.fit(xtrain, ytrain, - estimators=[ - (dict_model, {}), - (crf_model, {}), - (spacy_model, {}), - (bilstm_model, {}) - ], - is_pretrained=True) +estimators = [ + ("dict_model", dict_model), + ("crf_model", crf_model), + ("spacy_model", spacy_model), + ("bilstm_model", bilstm_model) +] 
+model = EnsembleNER(estimators=estimators, is_pretrained=True) ypred = model.predict(xtest) print(classification_report(flatten_list(ytest, strip_prefix=True), flatten_list(ypred, strip_prefix=True), From 744622ec90a4bc53c90a636171b902c95de346b5 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 08:56:39 -0800 Subject: [PATCH 43/64] documentation fixes --- nerds/models/flair.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nerds/models/flair.py b/nerds/models/flair.py index 447155d..aa5a442 100644 --- a/nerds/models/flair.py +++ b/nerds/models/flair.py @@ -84,8 +84,7 @@ def __init__(self, def fit(self, X, y): - """ Build feature vectors and train CRF model. Wrapper for - sklearn_crfsuite.CRF model. + """ Build feature vectors and train FLAIR model. Parameters ---------- @@ -138,7 +137,7 @@ def fit(self, X, y): def predict(self, X): - """ Predicts using trained CRF model. + """ Predicts using trained FLAIR model. Parameters ---------- From 396fd6f471b1e91f04b7845b52346f0001cf13dd Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 08:57:32 -0800 Subject: [PATCH 44/64] documentation fixes --- nerds/models/flair.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nerds/models/flair.py b/nerds/models/flair.py index aa5a442..2eedf37 100644 --- a/nerds/models/flair.py +++ b/nerds/models/flair.py @@ -163,7 +163,7 @@ def predict(self, X): def save(self, dirpath): - """ Save trained SpaCy NER model at dirpath. + """ Save trained FLAIR NER model at dirpath. Parameters ---------- @@ -183,7 +183,7 @@ def save(self, dirpath): def load(self, dirpath): - """ Load a pre-trained SpaCy NER model from dirpath. + """ Load a pre-trained FLAIR NER model from dirpath. Parameters ---------- From e0f47c0fbdd1791bff6285af1efc49bc2404683c Mon Sep 17 00:00:00 2001 From: sujitpal Date: Mon, 23 Dec 2019 10:14:24 -0800 Subject: [PATCH 45/64] params.yaml file creation in save(). 
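
Each model's save() now also calls the new nerds.utils.write_param_file() helper, which YAML-dumps the flat dictionary returned by get_params(), so every saved model directory carries a human-readable record of its hyperparameters. A small sketch of the effect using the CrfNER defaults (the target path is illustrative; the expected file contents are shown in comments):

```
from nerds.models import CrfNER
from nerds.utils import write_param_file

model = CrfNER()
write_param_file(model.get_params(), "models/crf_model/params.yaml")

# models/crf_model/params.yaml now holds one line per constructor
# parameter, for the CrfNER defaults:
#   c1: 0.1
#   c2: 0.1
#   featurizer: null
#   max_iter: 100
```
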
--- nerds/models/bilstm.py | 4 +- nerds/models/crf.py | 4 +- nerds/models/dictionary.py | 4 +- nerds/models/elmo.py | 4 +- nerds/models/ensemble.py | 8 +- nerds/models/flair.py | 6 +- nerds/models/spacy.py | 5 +- nerds/test/test_utils.py | 14 ++++ nerds/utils.py | 159 +++++++++++++++++++++++++------------ 9 files changed, 146 insertions(+), 62 deletions(-) diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py index 72219af..5d7e0c0 100644 --- a/nerds/models/bilstm.py +++ b/nerds/models/bilstm.py @@ -6,7 +6,7 @@ from keras.optimizers import Adam from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, write_param_file from sklearn.model_selection import train_test_split @@ -183,6 +183,8 @@ def save(self, dirpath): save_model(self.model_, weights_file, params_file) self.preprocessor_.save(preprocessor_file) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + def load(self, dirpath): """ Loads a trained model from local disk, given the dirpath diff --git a/nerds/models/crf.py b/nerds/models/crf.py index 69c7aa9..5c03f61 100644 --- a/nerds/models/crf.py +++ b/nerds/models/crf.py @@ -1,5 +1,5 @@ from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, write_param_file import os import joblib @@ -128,6 +128,8 @@ def save(self, dirpath): model_file = os.path.join(dirpath, "crf-model.pkl") joblib.dump(self.model_, model_file) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + def load(self, dirpath): """ Load a pre-trained CRF model from dirpath. diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py index 70ee863..d7020f5 100644 --- a/nerds/models/dictionary.py +++ b/nerds/models/dictionary.py @@ -1,5 +1,5 @@ from nerds.models import NERModel -from nerds.utils import get_logger, spans_to_tokens +from nerds.utils import get_logger, spans_to_tokens, write_param_file import ahocorasick import joblib @@ -118,6 +118,8 @@ def save(self, dirpath=None): model_file = os.path.join(dirpath, "dictionary-ner.pkl") joblib.dump(self.model_, model_file) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + def load(self, dirpath=None): """ Loads model from disk from dirpath. 
diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py index c69cdf1..2185ae1 100644 --- a/nerds/models/elmo.py +++ b/nerds/models/elmo.py @@ -7,7 +7,7 @@ from keras.optimizers import Adam from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, write_param_file from sklearn.model_selection import train_test_split @@ -193,6 +193,8 @@ def save(self, dirpath): save_model(self.model_, weights_file, params_file) self.preprocessor_.save(preprocessor_file) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + def load(self, dirpath): """ Loads a trained model from local disk, given the dirpath diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py index 5efdba7..1819e52 100644 --- a/nerds/models/ensemble.py +++ b/nerds/models/ensemble.py @@ -1,5 +1,5 @@ from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, write_param_file from sklearn.preprocessing import LabelEncoder @@ -105,12 +105,12 @@ def predict(self, X): return self._vote(predictions) - def load(model_dirpath): + def load(self, dirpath): raise NotImplementedError() - def save(model_dirpath): - raise NotImplementedError() + def save(self, dirpath): + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) def _fit_estimator(self, estimator, X, y): diff --git a/nerds/models/flair.py b/nerds/models/flair.py index 2eedf37..067d2dd 100644 --- a/nerds/models/flair.py +++ b/nerds/models/flair.py @@ -12,7 +12,7 @@ from torch.optim import SGD, Adam from nerds.models import NERModel -from nerds.utils import get_logger +from nerds.utils import get_logger, write_param_file log = get_logger() @@ -179,7 +179,9 @@ def save(self, dirpath): if not(os.path.exists(dirpath) and os.path.isdir(dirpath)): os.makedirs(dirpath) - self.model_.save(os.path.join(dirpath, "final-model.pt")) + self.model_.save(os.path.join(dirpath, "final-model.pt")) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) def load(self, dirpath): diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py index 8540490..15b7491 100644 --- a/nerds/models/spacy.py +++ b/nerds/models/spacy.py @@ -1,5 +1,6 @@ from nerds.models import NERModel -from nerds.utils import get_logger, spans_to_tokens, tokens_to_spans +from nerds.utils import (get_logger, write_param_file, + spans_to_tokens, tokens_to_spans) from spacy.util import minibatch @@ -144,6 +145,8 @@ def save(self, dirpath): os.makedirs(dirpath) self.model_.to_disk(dirpath) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + def load(self, dirpath): """ Load a pre-trained SpaCy NER model from dirpath. 
diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index c1f2947..aa74590 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -1,7 +1,9 @@ from nose.tools import assert_equal, assert_true from nerds.utils import * +from nerds.models import CrfNER +import os import spacy spacy_lm = spacy.load("en") @@ -13,6 +15,18 @@ def test_load_data_and_labels(): assert_equal(len(X[0]), len(y[0]), "Number of tokens should be equal to number of tags") +def test_write_param_file(): + model = CrfNER() + param_filepath = "nerds/test/data/crf_params.yaml" + write_param_file(model.get_params(), param_filepath) + lines = [] + with open(param_filepath, "r") as fp: + for line in fp: + lines.append(line.strip()) + assert_equal(4, len(lines)) + os.remove(param_filepath) + + def test_flatten_and_unflatten_list(): X, y = load_data_and_labels("nerds/test/data/example.iob") yflat = flatten_list(y, strip_prefix=True, capture_lengths=True) diff --git a/nerds/utils.py b/nerds/utils.py index d9e8bfb..0a7093c 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -1,6 +1,8 @@ import anago import itertools import logging +import os +import yaml def get_logger(log_level="DEBUG"): @@ -19,29 +21,59 @@ def load_data_and_labels(filepath): a wrapper because users of non-neural models are not expected to be familiar with Anago. - Args: - filepath (str): path to the file in BIO format to be loaded. + Parameters + ---------- + filepath : str + path to the file in BIO format to be loaded. - Returns: - x (list(str)): list of tokens. - y (list(str)): list of tags. + Returns + ------- + x : list(list(str)) + list of list of tokens, where list of tokens represent sentences. + y : list(list(str)) + list of list of tags. """ return anago.utils.load_data_and_labels(filepath) +def write_param_file(param_dict, param_filepath): + """ Write configured model hyperparameters to file for documentation. + + Parameters + ---------- + param_dict : dict(str, obj) + Flat dictionary of constructor parameter names and values + generated by cls.get_params() (from sklearn.base.BaseEstimator). + param_filepath : str + Full path to the parameter file. + + Returns + ------- + None + """ + param_dirpath = os.path.dirname(param_filepath) + if not os.path.isdir(param_dirpath): + os.makedirs(param_dirpath) + with open(param_filepath, "w") as fp: + fp.write(yaml.dump(param_dict)) + + def flatten_list(xs, strip_prefix=True, capture_lengths=False): """ Flatten label or predictions from list(list(str)) to list(str). Flattened list can be input to scikit-learn's standard functions to compute various metrics. - Args: - xs (list(list(str))): list of list of tags (inner list is sentence). - strip_prefix (bool): if True, remove leading I- and B-, else retain. - - Returns: - xs_flat list(str): the flattened list. - xs_lengths list(int) or None: a list of lengths of the inner list(str) - of the input xs. + Parameters + ---------- + xs : list(list(str)) + list of list of tags (inner list is sentence). + strip_prefix : bool + if True, remove leading I- and B-, else retain. + + Returns + ------- + xs_flat : list(str) + the flattened list. """ def strip_bio_prefix(label): return label.split('-')[-1] @@ -54,13 +86,19 @@ def strip_bio_prefix(label): def compute_list_lengths(xs): """ Convenience method to return a list of ints representing lengths of - inner lists in xs. 
Meant to be used in conjunction with flatten_list + to capture the original sentence lengths, so flattened list(str) can + be restored to list(list(str)) via unflatten_list. + + Parameters + ---------- + xs : list(list(str)) + list of list of tags. - Returns: - xs_lengths (list(int)): list of lengths of inner list. + Returns + ------- + xs_lengths : list(int) + list of lengths of inner list. """ return [len(x) for x in xs] @@ -69,12 +107,17 @@ def unflatten_list(xs_flat, xs_lengths): """ Reverse operation of flatten_list. Using the flattened list and the list of list lengths of the inner list, reconstructs original list(list(str)). - Args: - xs_flat list(str): the flattened list. - xs_lengths list(int): list of inner list to group by. - - Returns: - xs_unflat list(list(str)): original list of list(list(str)) + Parameters + ---------- + xs_flat : list(str) + the flattened list. + xs_lengths : list(int) + list of inner list to group by. + + Returns + ------- + xs_unflat : list(list(str)) + original list of list(list(str)) """ xs_unflat = [] start = 0 @@ -89,18 +132,25 @@ def tokens_to_spans(tokens, tags, allow_multiword_spans=True): """ Convert from tokens-tags format to sentence-span format. Some NERs use the sentence-span format, so we need to transform back and forth. - Args: - tokens (list(str)): list of tokens representing single sentence. - tags (list(str)): list of tags in BIO format. - allow_multiword_spans (bool): if True, offsets for consecutive - tokens of the same entity type are merged into a single span, - otherwise tokens are reported as individual spans. - - Returns: - sentence (str): the sentence as a string. - spans (list((int, int, str))): list of spans as a 3-tuple of start - position, end position, and entity type. Note that end position - is 1 beyond the actual ending position of the token. + Parameters + ---------- + tokens : list(str) + list of tokens representing single sentence. + tags : list(str) + list of tags in BIO format. + allow_multiword_spans : bool + if True, offsets for consecutive tokens of the same entity type are + merged into a single span, otherwise tokens are reported as individual + spans. + + Returns + ------- + sentence : str + the sentence as a string. + spans : list((int, int, str)) + list of spans as a 3-tuple of start position, end position, and entity + type. Note that end position is 1 beyond the actual ending position of + the token. """ spans = [] curr, start, end, ent_cls = 0, None, None, None @@ -139,20 +189,27 @@ def spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True): """ Convert from sentence-spans format to tokens-tags format. Some NERs use the sentence-spans format, so we need to transform back and forth. - Args: - sentence (str): the sentence as a string. - spans (list((int, int, str))): list of spans as a 3-tuple of - start_position, end_position, and entity_type. Note that end - position is 1 beyond actual end position of the token. - spacy_lm: we use SpaCy EN language model to tokenizing the - sentence to generate list of tokens. - spans_are_multiword (bool): if True, indicates that spans can - be multi-word spans), so consecutive entries of the same class - should be transformed, ie. (B-x, B-x) should become (B-x, I-x). - - Returns: - tokens (list(str)): list of tokens in sentence - tags (list(str)): list of tags in BIO format. + Parameters + ---------- + sentence : str + the sentence as a string. + spans : list((int, int, str)) + list of spans as a 3-tuple of start_position, end_position, and + entity_type. 
Note that end position is 1 beyond actual end position + of the token. + spacy_lm: we use SpaCy EN language model to tokenizing the sentence to + generate list of tokens. + spans_are_multiword : bool + if True, indicates that spans can be multi-word spans), so consecutive + entries of the same class should be transformed, ie. (B-x, B-x) should + become (B-x, I-x). + + Returns + ------- + tokens : list(str) + list of tokens in sentence + tags : list(str) + list of tags in BIO format. """ tokens, tags = [], [] curr_start, curr_end = 0, 0 From c3ba6fbd01a0d7af2e1c4e30c596113c5326f042 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Sat, 28 Dec 2019 08:23:03 -0800 Subject: [PATCH 46/64] Flair NER additions --- docs/CHANGES.md | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/CHANGES.md b/docs/CHANGES.md index b15b456..006094f 100644 --- a/docs/CHANGES.md +++ b/docs/CHANGES.md @@ -27,6 +27,8 @@ * does not give timestep size errors * ElmoNER * New, available in Anago DEV repo, same API as Anago's BiLSTMCRF + * FLAIR based NER + * New, incorporated from the [Zalando Flair project](https://github.com/flairNLP/flair). * EnsembleNER * simpler interface * weights from each classifier @@ -45,5 +47,4 @@ ## Planned * BERT Transformer based NER -* FLAIR based NER diff --git a/setup.py b/setup.py index 296bce6..4d5ba23 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ install_requires=[ 'allennlp', 'anago @ git+https://github.com/Hironsan/anago.git', + 'flair', 'future', 'h5py', 'hyperopt', From aa96dd4ce212c3dff55b29b4d9491b2acfe62e92 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 10 Jan 2020 18:58:12 -0800 Subject: [PATCH 47/64] added link to blog post on flair based NER --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 62ed01f..3828e36 100644 --- a/README.md +++ b/README.md @@ -134,3 +134,5 @@ The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that w # Talks and Blogs * \[slides\] [Slides for talk at PyData LA 2019](https://www.slideshare.net/sujitpal/building-named-entity-recognition-models-efficiently-using-nerds). +* \[video\] [Video of talk at PyData LA 2019](https://www.youtube.com/watch?v=ilzFiK0nAh8). +* \[blog\] [Incorporating third party NER (Flair) into NERDS](https://sujitpal.blogspot.com/2019/12/incorporating-flair-ner-into-nerds.html). From c2751f8db74667518ee081c545aca2d500d44ccd Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 10 Jan 2020 18:58:45 -0800 Subject: [PATCH 48/64] updated link to BioNLP shared task --- examples/BioNLP/data_prep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/BioNLP/data_prep.sh b/examples/BioNLP/data_prep.sh index 0b71e7c..7d8bdb9 100755 --- a/examples/BioNLP/data_prep.sh +++ b/examples/BioNLP/data_prep.sh @@ -6,13 +6,13 @@ mkdir train test echo "Downloading training data..." cd train -curl -O http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtraining.tar.gz +curl -O http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz tar xvf Genia4ERtraining.tar.gz rm Genia4ERtraining.tar.gz echo "Downloading test data..." 
cd ../test -curl -O http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtest.tar.gz +curl -O http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz tar xvf Genia4ERtest.tar.gz rm Genia4ERtest.tar.gz From 49b85110b2469f27651cf8c311102f84156dd1c8 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 10 Jan 2020 18:59:23 -0800 Subject: [PATCH 49/64] documentation change --- nerds/models/bilstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py index 5d7e0c0..187ff72 100644 --- a/nerds/models/bilstm.py +++ b/nerds/models/bilstm.py @@ -88,7 +88,7 @@ def __init__(self, def fit(self, X, y): - """ Trains the NER model. Input is list of AnnotatedDocuments. + """ Trains the NER model. Input is list of list of tokens and tags. Parameters ---------- From b7bfdc3d1baa601fa89c938e32d31ec1b2767904 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 10 Jan 2020 19:02:06 -0800 Subject: [PATCH 50/64] updated link to data --- examples/BioNLP/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/BioNLP/README.md b/examples/BioNLP/README.md index d8a6109..aa32330 100644 --- a/examples/BioNLP/README.md +++ b/examples/BioNLP/README.md @@ -1,6 +1,6 @@ # Dataset description -Data comes from the [Report on Bio-Entity Recognition Task at BioNLP/NLPBA 2004](http://www.nactem.ac.uk/tsujii/GENIA/ERtask/report.html) page. The page describes the provenance and characteristics of the data. +Data comes from the [GENIA Project page for BioNLP / JNLPBA Shared Task 2004](http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004). The page describes the provenance and characteristics of the data. In addition, [GloVe (Global Vectors for Word Representation) vectors](https://nlp.stanford.edu/projects/glove/) are needed to run the ElmoNER model. 
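
A minimal sketch of how such a GloVe embedding matrix can be assembled for the ElmoNER model is shown below; the file name, the vocabulary construction, and the commented-out usage are illustrative assumptions for this sketch, not part of the patches themselves:

```
import numpy as np

def load_glove_embeddings(glove_path, vocab, dim=100):
    # vocab is assumed to be a dict mapping each training token to a row
    # index, e.g. built from the tokens returned by load_data_and_labels()
    embeddings = np.random.random((len(vocab), dim))  # random init for OOV tokens
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            # GloVe format: token followed by its vector components
            parts = line.rstrip().split(" ")
            token, vector = parts[0], parts[1:]
            if token in vocab and len(vector) == dim:
                embeddings[vocab[token]] = np.asarray(vector, dtype="float32")
    return embeddings

# illustrative usage, assuming the 100-dim GloVe file from the link above:
# embeddings = load_glove_embeddings("glove.6B.100d.txt", vocab, dim=100)
# the resulting matrix is then passed to ElmoNER as its embeddings argument
```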
From 9f2109534322865c580bc64380bcaac37591e396 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Wed, 15 Jan 2020 18:21:45 -0800
Subject: [PATCH 51/64] rename tests to reflect intent (fix copy-paste)

---
 nerds/test/test_bilstm_ner.py | 2 +-
 nerds/test/test_elmo_ner.py   | 2 +-
 nerds/test/test_flair_ner.py  | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py
index 932fba0..a945ba1 100644
--- a/nerds/test/test_bilstm_ner.py
+++ b/nerds/test/test_bilstm_ner.py
@@ -5,7 +5,7 @@
 import shutil
 
 
-def test_crf_ner():
+def test_bilstm_ner():
     X, y = load_data_and_labels("nerds/test/data/example.iob")
     model = BiLstmCrfNER(max_iter=1)
     model.fit(X, y)
diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py
index 2ee8bf4..ee8a0b7 100644
--- a/nerds/test/test_elmo_ner.py
+++ b/nerds/test/test_elmo_ner.py
@@ -6,7 +6,7 @@
 import numpy as np
 import shutil
 
-def test_crf_ner():
+def test_elmo_ner():
     X, y = load_data_and_labels("nerds/test/data/example.iob")
     # there are 28 unique words in our "vocabulary"
     embeddings = np.random.random((28, 100))
diff --git a/nerds/test/test_flair_ner.py b/nerds/test/test_flair_ner.py
index d366f76..44772d8 100644
--- a/nerds/test/test_flair_ner.py
+++ b/nerds/test/test_flair_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import FlairNER
@@ -5,9 +8,6 @@
 import shutil
 
 
-import warnings
-warnings.filterwarnings("ignore")
-
 def test_flair_ner():
     X, y = load_data_and_labels("nerds/test/data/example.iob")
     model = FlairNER("nerds/test/data/models", max_iter=1)
From dff1a70ac2e2e64462c818c53a3657666ca1b47f Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Wed, 15 Jan 2020 18:22:31 -0800
Subject: [PATCH 52/64] add encoding to load_data_and_labels

---
 nerds/utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/nerds/utils.py b/nerds/utils.py
index 0a7093c..120d331 100644
--- a/nerds/utils.py
+++ b/nerds/utils.py
@@ -16,7 +16,7 @@ def get_logger(log_level="DEBUG"):
     return logger
 
 
-def load_data_and_labels(filepath):
+def load_data_and_labels(filepath, encoding="utf-8"):
     """ Wrapper to expose anago's load_data_and_labels. Built here
         as a wrapper because users of non-neural models are not expected
         to be familiar with Anago.
@@ -25,7 +25,10 @@
     ----------
     filepath : str
         path to the file in BIO format to be loaded.
-
+    encoding : str, default utf-8
+        a standard Python encoding; see the list at
+        https://docs.python.org/3/library/codecs.html#standard-encodings
+
     Returns
     -------
     x : list(list(str))
        list of list of tokens, where list of tokens represent sentences.
     y : list(str))
        list of list of tags.
""" - return anago.utils.load_data_and_labels(filepath) + return anago.utils.load_data_and_labels(filepath, encoding) def write_param_file(param_dict, param_filepath): From 5d90712f2d4b761d95a01d0bd412200640d1ed70 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Wed, 15 Jan 2020 18:23:01 -0800 Subject: [PATCH 53/64] initial revision --- examples/BioNLP/test_models.py | 11 +- examples/GMB/test_models.py | 16 +- nerds/models/__init__.py | 2 + nerds/models/bert.py | 506 +++++++++++++++++++++++++++++++++ nerds/test/test_bert_ner.py | 22 ++ 5 files changed, 554 insertions(+), 3 deletions(-) create mode 100644 nerds/models/bert.py create mode 100644 nerds/test/test_bert_ner.py diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py index 59c26ec..62d54af 100644 --- a/examples/BioNLP/test_models.py +++ b/examples/BioNLP/test_models.py @@ -7,7 +7,7 @@ from nerds.models import ( DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, - ElmoNER, FlairNER, EnsembleNER + ElmoNER, FlairNER, BertNER, EnsembleNER ) from nerds.utils import * @@ -84,6 +84,15 @@ flatten_list(ypred, strip_prefix=True), labels=entity_labels)) +# train and test the BERT NER +model = BertNER(max_sequence_length=256) +model.fit(xtrain, ytrain) +model.save("models/bert_ner") +trained_model = model.load("models/bert_ner") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) # create and test an ensemble dict_model = DictionaryNER() diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py index d76ed69..6fe36f5 100644 --- a/examples/GMB/test_models.py +++ b/examples/GMB/test_models.py @@ -8,10 +8,11 @@ from nerds.models import ( DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, - ElmoNER, FlairNER, EnsembleNER + ElmoNER, FlairNER, BertNER, EnsembleNER ) from nerds.utils import * + def convert_to_iob_format(input_file, output_file): num_written = 0 fout = open(output_file, "w") @@ -31,7 +32,8 @@ def convert_to_iob_format(input_file, output_file): fout.write("\n") fout.close() -# convert Kaggle dataset to our standard IOB format + +# convert GMB dataset to our standard IOB format if not os.path.exists("train.iob"): convert_to_iob_format("train.csv", "train.iob") @@ -116,6 +118,16 @@ def convert_to_iob_format(input_file, output_file): flatten_list(ypred, strip_prefix=True), labels=entity_labels)) +# train and test the BERT NER +model = BertNER() +model.fit(xtrain, ytrain) +model.save("models/bert_model") +trained_model = model.load("models/bert_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + # create and test an ensemble dict_model = DictionaryNER() dict_model.load("models/dict_model") diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py index 0b4ed26..4388d2f 100644 --- a/nerds/models/__init__.py +++ b/nerds/models/__init__.py @@ -5,6 +5,7 @@ from nerds.models.dictionary import DictionaryNER from nerds.models.elmo import ElmoNER from nerds.models.flair import FlairNER +from nerds.models.bert import BertNER from nerds.models.ensemble import EnsembleNER __all__ = [ @@ -15,5 +16,6 @@ "BiLstmCrfNER", "ElmoNER", "FlairNER", + "BertNER", "EnsembleNER" ] diff --git a/nerds/models/bert.py b/nerds/models/bert.py new file mode 100644 index 0000000..c70b3c6 --- /dev/null +++ b/nerds/models/bert.py @@ -0,0 +1,506 @@ +import joblib +import numpy as np +import os 
+import random
+import time
+import torch
+
+from nerds.models import NERModel
+from nerds.utils import flatten_list, get_logger, write_param_file
+
+from transformers import AdamW
+from transformers import BertForTokenClassification, BertTokenizer
+from transformers import get_linear_schedule_with_warmup
+
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+
+
+log = get_logger()
+
+class BertNER(NERModel):
+
+    def __init__(self,
+            lang_model="bert-base-cased",
+            max_sequence_length=128,
+            learning_rate=2e-5,
+            batch_size=32,
+            max_iter=4,
+            verbose=False,
+            random_state=42):
+        """ Construct a BERT NER model. Uses a pretrained BERT language
+        model and a fine-tuning model for NER provided by the HuggingFace
+        transformers library.
+
+        Parameters
+        ----------
+        lang_model : str, optional, default "bert-base-cased"
+            pre-trained BERT language model to use.
+        max_sequence_length : int, optional, default 128
+            maximum sequence length in tokens for input sentences. Shorter
+            sentences will be right padded and longer sentences will be
+            truncated.
+        learning_rate : float, optional, default 2e-5
+            learning rate for the Adam optimizer.
+        batch_size : int, optional, default 32
+            batch size to use for training.
+        max_iter : int, default 4
+            number of epochs to fine tune.
+        verbose : bool, optional, default False
+            whether to display log messages on console.
+        random_state : int, optional, default 42
+            random state to set for repeatable results.
+
+        Attributes
+        ----------
+        model_ : reference to underlying BertForTokenClassification object.
+        tokenizer_ : reference to underlying BertTokenizer object.
+        label2id_ : mapping from string labels to internal int ids.
+        id2label_ : mapping from internal int label ids to string labels.
+        train_losses_ : list(float) of per epoch training losses.
+        val_accs_ : list(float) of per epoch validation accuracies.
+        special_tokens_ : set of tokenizer special tokens.
+        """
+        super().__init__()
+        # model parameters
+        self.lang_model = lang_model
+        self.max_sequence_length = max_sequence_length
+        self.learning_rate = learning_rate
+        self.batch_size = batch_size
+        self.max_iter = max_iter
+        self.verbose = verbose
+        self.random_state = random_state
+        self._set_random_state(random_state)
+        # model attributes
+        self.model_ = None
+        self.tokenizer_ = None
+        self.label2id_ = None
+        self.id2label_ = None
+        self.train_losses_ = None
+        self.val_accs_ = None
+        self.special_tokens_ = None
+        # hidden variables
+        self._pad_label_id = torch.nn.CrossEntropyLoss().ignore_index
+        self._device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+
+    def fit(self, X, y):
+        """ Trains the NER model. Input is list of list of tokens and tags.
+
+        Parameters
+        ----------
+        X : list(list(str))
+            list of list of tokens
+        y : list(list(str))
+            list of list of BIO tags.
+ + Returns + ------- + self + """ + log.info("Converting data and labels to features...") + self.label2id_, self.id2label_ = self._build_label_id_mappings(y) + + Xtrain, Xval, ytrain, yval = train_test_split( + X, y, test_size=0.1, random_state=self.random_state) + + self.tokenizer_ = BertTokenizer.from_pretrained( + self.lang_model, do_basic_tokenize=False) + self.special_tokens_ = set([ + self.tokenizer_.pad_token, self.tokenizer_.unk_token, + self.tokenizer_.sep_token, self.tokenizer_.cls_token]) + + train_feats = self._data_labels_to_features(Xtrain, ytrain) + train_loader = self._create_dataloader(train_feats, "random") + val_feats = self._data_labels_to_features(Xval, yval) + val_loader = self._create_dataloader(val_feats, "sequential") + + log.info("Building model...") + self.model_ = BertForTokenClassification.from_pretrained(self.lang_model, + num_labels=len(self.label2id_), + output_attentions=False, + output_hidden_states=False) + self.model_.to(self._device) + + total_steps = len(train_loader) * self.max_iter + optimizer = AdamW(self.model_.parameters(), lr=self.learning_rate, eps=1e-8) + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps=0, num_training_steps=total_steps) + + self.train_losses_, self.val_accs_ = [], [] + for epoch in range(self.max_iter): + + log.info("==== Epoch {:d}/{:d}".format(epoch + 1, self.max_iter)) + log.info("Training...") + t0 = time.time() + total_loss = 0 + self.model_.train() + + for step, batch in enumerate(train_loader): + if step % 100 == 0: + elapsed = time.time() - t0 + log.info(" Batch {:d} of {:d}, elapsed: {:.3f}s".format( + step + 1, len(train_loader), elapsed)) + b_input_ids = batch[0].to(self._device) + b_attention_mask = batch[1].to(self._device) + b_token_type_ids = batch[2].to(self._device) + b_label_ids = batch[3].to(self._device) + + self.model_.zero_grad() + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids, + labels=b_label_ids) + + loss = outputs[0] + total_loss += loss.item() + loss.backward() + + torch.nn.utils.clip_grad_norm_(self.model_.parameters(), 1.0) + optimizer.step() + scheduler.step() + + avg_train_loss = total_loss / len(train_loader) + self.train_losses_.append(avg_train_loss) + + log.info(" Average training loss: {:.3f}".format(avg_train_loss)) + log.info(" Training epoch took: {:.3f}s".format(time.time() - t0)) + + log.info("Validation...") + t0 = time.time() + self.model_.eval() + + val_acc, val_steps = 0, 0 + for batch in val_loader: + batch = tuple(b.to(self._device) for b in batch) + b_input_ids, b_attention_mask, b_token_type_ids, b_label_ids, _ = batch + with torch.no_grad(): + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids) + logits = outputs[0].detach().cpu().numpy() + b_preds = np.argmax(logits, axis=-1).flatten() + b_labels = b_label_ids.detach().cpu().numpy().flatten() + b_val_acc = accuracy_score(b_preds, b_labels) + val_acc += b_val_acc + val_steps += 1 + + val_acc = val_acc / val_steps + + self.val_accs_.append(val_acc) + log.info(" Accuracy: {:.3f}".format(val_acc)) + log.info(" Validation took {:.3f}s".format(time.time() - t0)) + + log.info("==== Training complete ====") + return self + + + def predict(self, X): + """ Predicts using the NER model. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. 
+ """ + if self.model_ is None or self.tokenizer_ is None: + raise ValueError("No model and/or tokenizer found, either run fit() to train or load() to load a trained model.") + + log.info("Converting data to features...") + test_feats = self._data_labels_to_features(X, None) + test_loader = self._create_dataloader(test_feats, "sequential") + + log.info("Predicting from model...") + predictions = [] + self.model_.eval() + for batch in test_loader: + batch = tuple(b.to(self._device) for b in batch) + b_input_ids, b_attention_mask, b_token_type_ids, b_ids = batch + with torch.no_grad(): + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids) + logits = outputs[0].detach().cpu().numpy() + b_pred_ids = np.argmax(logits, axis=-1) + b_ids = b_ids.detach().cpu().numpy() + b_id_min, b_id_max = b_ids[0], b_ids[-1] + b_X = X[b_id_min : b_id_max + 1] + predictions.extend(self._align_predictions(b_X, b_pred_ids)) + + return predictions + + + def save(self, dirpath): + """ Saves model and related artifacts to specified folder on disk + + Parameters + ---------- + dirpath : str + a directory where model artifacts are to be saved. Artifacts for + this NER are the HuggingFace model and tokenizer, a pickled file + containing the label-to-id and id-to-label mappings, and the NER + configuration YAML file. + + Returns + ------- + None + """ + if self.model_ is None or self.tokenizer_ is None: + raise ValueError("No model artifacts to save, either run fit() to train or load() pretrained model.") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + self.model_.save_pretrained(dirpath) + self.tokenizer_.save_pretrained(dirpath) + label_map = { + "label2id": self.label2id_, + "id2label": self.id2label_, + "special_tokens": self.special_tokens_ + } + joblib.dump(label_map, os.path.join(dirpath, "label_mappings.pkl")) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Loads a trained model from specified folder on disk. + + Parameters + ---------- + dirpath : str + directory from which model artifacts should be loaded + + Returns + ------- + self + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + label_mappings = joblib.load(os.path.join(dirpath, "label_mappings.pkl")) + self.label2id_ = label_mappings["label2id"] + self.id2label_ = label_mappings["id2label"] + self.special_tokens_ = label_mappings["special_tokens"] + self.model_ = BertForTokenClassification.from_pretrained(dirpath, + num_labels=len(self.label2id_), + output_attentions=False, + output_hidden_states=False) + self.model_.to(self._device) + self.tokenizer_ = BertTokenizer.from_pretrained(dirpath, + do_basic_tokenize=False) + + return self + + + def _set_random_state(self, seed): + """ Set the random seed for reproducible results. + + Parameters + ---------- + seed : int + a numeric random seed. + + Returns + ------- + None + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + + def _build_label_id_mappings(self, labels): + """ Build label (string) to label_id (int) mappings + + Parameters + ---------- + labels : list(list(str)) + labels as provided by the utils.load_data_and_labels() function. 
+ + Returns + ------- + label2id, id2label + """ + label2id = {l:i for i, l in + enumerate(sorted(list(set(flatten_list(labels, strip_prefix=False)))))} + id2label = {v:k for k, v in label2id.items()} + return label2id, id2label + + + def _data_labels_to_features(self, data, labels): + """ Convert data and labels from utils.load_data_and_labels() function + to list of features needed by the HuggingFace BertForTokenClassification + object. + + Parameters + ---------- + data : list(list(str)) + list of list of input tokens + labels : list(list(str)), can be None. + list of list of input BIO tags. + + Returns + ------- + input_ids : list(list(int)) + list of zero-padded fixed length token_ids. + attention_mask : list(list(int)) + mask to avoid performing attention on padding tokens. + token_type_ids : list(list(int)) + segment token indices, all zero since we consider single sequence. + label_ids : list(list(int)) or None + list of zero-padded fixed length label_ids. Set to None if + labels parameter is None. + """ + input_ids, attention_mask, token_type_ids, label_ids = [], [], [], [] + # if labels is None (not supplied), then replace with pseudo labels + labels_supplied = True + if labels is None: + labels_supplied = False + labels = [] + for tokens in data: + labels.append(["O"] * len(tokens)) + + # input is (list(list(str)), list(list(str))) + # format of input is: [CLS] sentence [SEP] + for i, (tokens, tags) in enumerate(zip(data, labels)): + tokens_sent, tags_sent = [], [] + for token, tag in zip(tokens, tags): + subwords = self.tokenizer_.tokenize(token) + if len(subwords) == 0: + tokens_sent.append(token) + else: + tokens_sent.extend(subwords) + tags_sent.append(self.label2id_[tag]) + if len(subwords) > 1: + # repeat tag for all subwords following the specified word, see + # https://github.com/google-research/bert/issues/646#issuecomment-519868110 + if tag.startswith("B-"): + tag = tag.replace("B-", "I-") + tags_sent.extend([self.label2id_[tag]] * (len(subwords) - 1)) + + # truncate to max_sequence_length - 2 (account for special tokens CLS and SEP) + tokens_sent = tokens_sent[0:self.max_sequence_length - 2] + tags_sent = tags_sent[0:self.max_sequence_length - 2] + + # pad upto the max_sequence_length - 2 (account for special tokens CLS and SEP) + tokens_to_pad = self.max_sequence_length - (len(tokens_sent) + 2) + tokens_sent.extend([self.tokenizer_.pad_token] * tokens_to_pad) + tags_sent.extend([self._pad_label_id] * tokens_to_pad) + + # add in [CLS] and [SEP] for tags_sent and tokens_sent + input_tokens = self.tokenizer_.build_inputs_with_special_tokens(tokens_sent) + input_tags = [self._pad_label_id] + tags_sent + [self._pad_label_id] + + # feature: input_ids + input_ids.append(self.tokenizer_.convert_tokens_to_ids(input_tokens)) + # feature: attention_mask + attention_mask.append([0 if t == self.tokenizer_.pad_token else 1 for t in input_tokens]) + # feature: token_type_ids + token_type_ids.append( + self.tokenizer_.create_token_type_ids_from_sequences(tokens_sent)) + # feature: label_ids + label_ids.append(input_tags) + + if self.verbose and i < 5: + log.info("row[{:d}].features:".format(i)) + log.info(" input_tokens:", input_tokens) + log.info(" input_ids:", input_ids[i]) + log.info(" attention_mask:", attention_mask[i]) + log.info(" token_type_ids:", token_type_ids[i]) + log.info(" label_ids:", label_ids[i]) + + if labels_supplied: + return input_ids, attention_mask, token_type_ids, label_ids + else: + return input_ids, attention_mask, token_type_ids, None + + + def 
_create_dataloader(self, features, sampling): + """ Converts features to Torch DataLoader for different data splits. + + Parameters + ---------- + features : (input_ids, attention_mask, token_type_ids, label_ids) + sampling : "random" (training) or "sequential" (everything else) + + Returns + ------- + dataloader : reference to Torch DataLoader. + """ + input_ids, attention_mask, token_type_ids, label_ids = features + # convert to torch tensors + input_ids_t = torch.tensor(input_ids, dtype=torch.long) + attention_mask_t = torch.tensor(attention_mask, dtype=torch.long) + token_type_ids_t = torch.tensor(token_type_ids, dtype=torch.long) + ids_t = torch.tensor(np.arange(len(token_type_ids)), dtype=torch.long) + + # wrap tensors into dataset + if label_ids is not None: + label_ids_t = torch.tensor(label_ids, dtype=torch.long) + dataset = TensorDataset(input_ids_t, attention_mask_t, + token_type_ids_t, label_ids_t, ids_t) + else: + dataset = TensorDataset(input_ids_t, attention_mask_t, + token_type_ids_t, ids_t) + + # wrap dataset into dataloader and return dataloader + if sampling == "random": + sampler = RandomSampler(dataset) + else: + sampler = SequentialSampler(dataset) + dataloader = DataLoader(dataset, sampler=sampler, batch_size=self.batch_size) + return dataloader + + + def _align_predictions(self, data, pred_ids): + """ Align internal predictions from model that are aligned to + wordpieces with external labels that are aligned to tokens. + + Parameters + ---------- + data : list(list(str)) + list of jagged list of input tokens. + pred_ids : list(list(long)) + list of same size list of prediction ids. + + Returns + ------- + predictions : list(list(str)) + list of list of predictions aligned to input tokens + and using same class names as input labels. 
+ """ + data_a, preds_a = [], [] + for tokens, pred_tag_ids in zip(data, pred_ids): + tokens_x = [] + for token in tokens: + tokens_x.extend(self.tokenizer_.tokenize(token)) + tokens_r, preds_r = [], [] + for t, p in zip(tokens_x, pred_tag_ids): + if t in self.special_tokens_: + continue + if t.startswith("##"): + tokens_r[-1] = tokens_r[-1] + t[2:] + else: + tokens_r.append(t) + preds_r.append(self.id2label_[p]) + + if len(tokens_r) < len(tokens): + # pad any truncated sentences with [PAD]/O + num_pad_tokens = len(tokens) - len(tokens_r) + tokens_r.extend([self.tokenizer_.pad_token] * num_pad_tokens) + preds_r.extend(["O"] * num_pad_tokens) + + data_a.append(tokens_r) + preds_a.append(preds_r) + + return preds_a + diff --git a/nerds/test/test_bert_ner.py b/nerds/test/test_bert_ner.py new file mode 100644 index 0000000..a3133c5 --- /dev/null +++ b/nerds/test/test_bert_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import BertNER +from nerds.utils import load_data_and_labels + +import numpy as np +import shutil + + +def test_bert_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = BertNER(max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + shutil.rmtree("nerds/test/data/models") From c47d8ef5161c7450e6df294a057f0af1f49292e3 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 17 Jan 2020 12:27:17 -0800 Subject: [PATCH 54/64] fix default behavior to pad with padding tag in case of truncated tokens during prediction, with utility method to align for metric computations --- nerds/models/bert.py | 26 ++++++++++++++++++------- nerds/test/test_utils.py | 23 +++++++++++++++++++++- nerds/utils.py | 41 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 81 insertions(+), 9 deletions(-) diff --git a/nerds/models/bert.py b/nerds/models/bert.py index c70b3c6..bad3a94 100644 --- a/nerds/models/bert.py +++ b/nerds/models/bert.py @@ -29,6 +29,7 @@ def __init__(self, learning_rate=2e-5, batch_size=32, max_iter=4, + padding_tag="O", verbose=False, random_state=42): """ Construct a BERT NER model. Uses a pretrained BERT language model @@ -49,6 +50,8 @@ def __init__(self, batch size to use for training. max_iter : int, default 4 number of epochs to fine tune. + padding_tag : str, default "O" + tag to pad predictions with if len(tokens) > len(predicted_tags). verbose : bool, optional, default False whether to display log messages on console. random_state : int, optional, default 42 @@ -71,6 +74,7 @@ def __init__(self, self.learning_rate = learning_rate self.batch_size = batch_size self.max_iter = max_iter + self.padding_tag = padding_tag self.verbose = verbose self.random_state = random_state self._set_random_state(random_state) @@ -199,7 +203,13 @@ def fit(self, X, y): def predict(self, X): - """ Predicts using the NER model. + """ Predicts using the NER model. Note that because of the + way BERT re-tokenizes incoming tokens to word-pieces, it + is possible that some incoming tokens may not be presented + to the model for NER tagging, and hence the list of predicted + tags will padded with a pseudo-tag (default 'O'). 
If you chose + a different pseudo-tag, you will need to re-align labels and + predictions using nerds.utils.align_lists(). Parameters ---------- @@ -381,11 +391,13 @@ def _data_labels_to_features(self, data, labels): tokens_sent.extend(subwords) tags_sent.append(self.label2id_[tag]) if len(subwords) > 1: - # repeat tag for all subwords following the specified word, see - # https://github.com/google-research/bert/issues/646#issuecomment-519868110 - if tag.startswith("B-"): - tag = tag.replace("B-", "I-") - tags_sent.extend([self.label2id_[tag]] * (len(subwords) - 1)) + tags_sent.extend([self._pad_label_id] * (len(subwords) - 1)) + # if len(subwords) > 1: + # # repeat tag for all subwords following the specified word, see + # # https://github.com/google-research/bert/issues/646#issuecomment-519868110 + # if tag.startswith("B-"): + # tag = tag.replace("B-", "I-") + # tags_sent.extend([self.label2id_[tag]] * (len(subwords) - 1)) # truncate to max_sequence_length - 2 (account for special tokens CLS and SEP) tokens_sent = tokens_sent[0:self.max_sequence_length - 2] @@ -497,7 +509,7 @@ def _align_predictions(self, data, pred_ids): # pad any truncated sentences with [PAD]/O num_pad_tokens = len(tokens) - len(tokens_r) tokens_r.extend([self.tokenizer_.pad_token] * num_pad_tokens) - preds_r.extend(["O"] * num_pad_tokens) + preds_r.extend([self.padding_tag] * num_pad_tokens) data_a.append(tokens_r) preds_a.append(preds_r) diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py index aa74590..8075331 100644 --- a/nerds/test/test_utils.py +++ b/nerds/test/test_utils.py @@ -1,3 +1,6 @@ +import warnings +warnings.filterwarnings("ignore") + from nose.tools import assert_equal, assert_true from nerds.utils import * @@ -29,7 +32,7 @@ def test_write_param_file(): def test_flatten_and_unflatten_list(): X, y = load_data_and_labels("nerds/test/data/example.iob") - yflat = flatten_list(y, strip_prefix=True, capture_lengths=True) + yflat = flatten_list(y, strip_prefix=True) assert_equal(36, len(yflat), "There should be 36 tags in all") assert_equal(5, len([y for y in yflat if y == "PER"]), "There should be 5 PER tags") y_lengths = compute_list_lengths(y) @@ -98,3 +101,21 @@ def test_spans_to_tokens_no_multiword_spans(): for ref_pred, pred in zip(ref_preds, tags): assert_equal(ref_pred, pred, "Tags do not match. 
{:s} != {:s}".format(ref_pred, pred)) + +def align_lists_padded(): + labels = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] + preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'X', 'X', 'X', 'X'] + assert_equal(len(labels), len(preds)) + labels_a, preds_a = align_lists(labels, preds, padding_tag="X") + assert_equal(len(labels_a), len(preds_a)) + assert_equal(len(labels_a), len(labels) - 4) + + +def align_lists_unpadded(): + labels = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] + preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O'] + assert_true(len(labels) > len(preds)) + labels_a, preds_a = align_lists(labels, preds) + assert_equal(len(labels_a), len(preds_a)) + assert_equal(len(preds_a), len(preds)) + assert_equal(len(labels_a), len(labels) - 4) diff --git a/nerds/utils.py b/nerds/utils.py index 120d331..87c7684 100644 --- a/nerds/utils.py +++ b/nerds/utils.py @@ -61,7 +61,7 @@ def write_param_file(param_dict, param_filepath): fp.write(yaml.dump(param_dict)) -def flatten_list(xs, strip_prefix=True, capture_lengths=False): +def flatten_list(xs, strip_prefix=True): """ Flatten label or predictions from list(list(str)) to list(str). Flattened list can be input to scikit-learn's standard functions to compute various metrics. @@ -131,6 +131,45 @@ def unflatten_list(xs_flat, xs_lengths): return xs_unflat +def align_lists(labels, predictions, padding_tag=None): + """ Tokenizers paired with BERT-like transformer based NERs break up + tokens into word-pieces in order to minimize or eliminate [UNK] + situations. However, these word-pieces count to the max_sequence_length + specified in the NER, which may mean that predictions will have + fewer tags than labels because the last few tokens in the tokenized + input string have been cut off. This function will align the + label and prediction lists by removing these tags from the labels. + + Parameters + ---------- + labels : list(list(str)) + list of list of label tags + predictions : list(list(str)) + list of list of prediction tags + padding_tag : str, default None + special token (not part of label set) to denote padding tag. + + Returns + ------- + labels, predictions : labels list aligned to predictions + """ + if len(labels) != len(predictions): + raise ValueError("Number of tag lists (for sentences) in label and prediction must match.") + + labels_a, predictions_a = [], [] + for tags_l, tags_p in zip(labels, predictions): + if len(tags_l) != len(tags_p): + if padding_tag is not None: + tags_p = [t for t in tags_p if t != padding_tag] + labels_a.append(tags_l[0:len(tags_p)]) + predictions_a.append(tags_p) + else: + labels_a.append(tags_l) + predictions_a.append(tags_p) + + return labels_a, predictions_a + + def tokens_to_spans(tokens, tags, allow_multiword_spans=True): """ Convert from tokens-tags format to sentence-span format. Some NERs use the sentence-span format, so we need to transform back and forth. From 2a6cc00cd6bb9052cc3cc0cc9a8fbcf69ce2594b Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 17 Jan 2020 18:13:09 -0800 Subject: [PATCH 55/64] added suppress warnings to all tests, added new utils methods and tests. 
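
The renamed align_labels_and_predictions() helper and the new
get_labels_from_data() helper are meant to be used together when scoring
NERs whose word-piece tokenizers can truncate or pad sentences. A sketch
of the intended call pattern follows; ypred and the "X" padding tag are
illustrative, not part of this commit:

```
from sklearn.metrics import classification_report
from nerds.utils import (align_labels_and_predictions, flatten_list,
                         get_labels_from_data, load_data_and_labels)

X, y = load_data_and_labels("train.iob")
entity_labels = get_labels_from_data(y, strip_prefix=True)
# ypred comes from a trained NER that emits "X" for padding word-pieces
y_a, ypred_a = align_labels_and_predictions(y, ypred, padding_tag="X")
print(classification_report(flatten_list(y_a), flatten_list(ypred_a),
                            labels=entity_labels))
```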
---
 nerds/test/test_base_ner.py       |  3 +++
 nerds/test/test_bilstm_ner.py     |  3 +++
 nerds/test/test_crf_ner.py        |  3 +++
 nerds/test/test_dictionary_ner.py |  3 +++
 nerds/test/test_elmo_ner.py       |  3 +++
 nerds/test/test_ensemble_ner.py   |  3 +++
 nerds/test/test_spacy_ner.py      |  3 +++
 nerds/test/test_utils.py          | 44 +++++++++++++++++++------------
 nerds/utils.py                    | 29 +++++++++++++++-----
 9 files changed, 70 insertions(+), 24 deletions(-)

diff --git a/nerds/test/test_base_ner.py b/nerds/test/test_base_ner.py
index 6e06cff..f2014d0 100644
--- a/nerds/test/test_base_ner.py
+++ b/nerds/test/test_base_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_raises
 
 from nerds.models import NERModel
diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py
index a945ba1..c9e4241 100644
--- a/nerds/test/test_bilstm_ner.py
+++ b/nerds/test/test_bilstm_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import BiLstmCrfNER
diff --git a/nerds/test/test_crf_ner.py b/nerds/test/test_crf_ner.py
index f296d1f..770e35b 100644
--- a/nerds/test/test_crf_ner.py
+++ b/nerds/test/test_crf_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import CrfNER
diff --git a/nerds/test/test_dictionary_ner.py b/nerds/test/test_dictionary_ner.py
index c4daa7b..094714f 100644
--- a/nerds/test/test_dictionary_ner.py
+++ b/nerds/test/test_dictionary_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import DictionaryNER
diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py
index ee8a0b7..c0ae355 100644
--- a/nerds/test/test_elmo_ner.py
+++ b/nerds/test/test_elmo_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import ElmoNER
diff --git a/nerds/test/test_ensemble_ner.py b/nerds/test/test_ensemble_ner.py
index 05ba593..9e51fad 100644
--- a/nerds/test/test_ensemble_ner.py
+++ b/nerds/test/test_ensemble_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import DictionaryNER, CrfNER, SpacyNER, EnsembleNER
diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py
index 0d800ce..ea91464 100644
--- a/nerds/test/test_spacy_ner.py
+++ b/nerds/test/test_spacy_ner.py
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 from nose.tools import assert_equal, assert_true
 
 from nerds.models import SpacyNER
diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py
index 8075331..2a223ed 100644
--- a/nerds/test/test_utils.py
+++ b/nerds/test/test_utils.py
@@ -18,6 +18,14 @@ def test_load_data_and_labels():
     assert_equal(len(X[0]), len(y[0]), "Number of tokens should be equal to number of tags")
 
 
+def test_get_labels_from_data():
+    X, y = load_data_and_labels("nerds/test/data/example.iob")
+    raw_labels = get_labels_from_data(y)
+    assert_equal(8, len(raw_labels), "There should be 8 unique raw labels")
+    class_labels = get_labels_from_data(y, strip_prefix=True)
+    assert_equal(5, len(class_labels), "There should be 5 unique class labels")
+
+
 def test_write_param_file():
     model = CrfNER()
     param_filepath = "nerds/test/data/crf_params.yaml"
@@ -102,20 +110,21 @@ def test_spans_to_tokens_no_multiword_spans():
         assert_equal(ref_pred, pred, "Tags do not match. {:s} != {:s}".format(ref_pred, pred))
 
 
-def align_lists_padded():
-    labels = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']
-    preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'X', 'X', 'X', 'X']
-    assert_equal(len(labels), len(preds))
-    labels_a, preds_a = align_lists(labels, preds, padding_tag="X")
-    assert_equal(len(labels_a), len(preds_a))
-    assert_equal(len(labels_a), len(labels) - 4)
-
-
-def align_lists_unpadded():
-    labels = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']
-    preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O']
-    assert_true(len(labels) > len(preds))
-    labels_a, preds_a = align_lists(labels, preds)
-    assert_equal(len(labels_a), len(preds_a))
-    assert_equal(len(preds_a), len(preds))
-    assert_equal(len(labels_a), len(labels) - 4)
+def test_align_labels_and_predictions_with_padding():
+    labels = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']]
+    preds = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'X', 'X', 'X', 'X']]
+    assert_equal(len(labels[0]), len(preds[0]), "Label and Prediction should have same number of tags")
+    labels_a, preds_a = align_labels_and_predictions(labels, preds, padding_tag="X")
+    assert_equal(len(labels_a[0]), len(preds_a[0]), "After padded alignment, Label and Prediction should have same number of tags")
+    assert_equal(len(labels_a[0]), len(labels[0]) - 4, "After padded alignment, labels should be shorter than before.")
+    assert_equal(len(preds_a[0]), len(preds[0]) - 4, "After padded alignment, predictions should be shorter than before.")
+
+
+def test_align_labels_and_predictions_without_padding():
+    labels = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']]
+    preds = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O']]
+    assert_true(len(labels[0]) > len(preds[0]), "Label should have more tags than Prediction before alignment")
+    labels_a, preds_a = align_labels_and_predictions(labels, preds)
+    assert_equal(len(labels_a[0]), len(preds_a[0]), "After unpadded alignment, Label and Prediction should have same number of tags")
+    assert_equal(len(preds_a[0]), len(preds[0]), "After unpadded alignment, number of prediction tags should be unchanged.")
+    assert_equal(len(labels_a[0]), len(labels[0]) - 4, "After unpadded alignment, labels should be shorter.")
diff --git a/nerds/utils.py b/nerds/utils.py
index 87c7684..7f4c302 100644
--- a/nerds/utils.py
+++ b/nerds/utils.py
@@ -39,6 +39,29 @@ def load_data_and_labels(filepath, encoding="utf-8"):
     return anago.utils.load_data_and_labels(filepath, encoding)
 
 
+def get_labels_from_data(labels, strip_prefix=False):
+    """ Return the sorted list of unique tags found in the labels.
+
+    Parameters
+    ----------
+    labels : list(list(str))
+        list of list of BIO tags.
+    strip_prefix : bool, default False
+        if True, strip the B- and I- prefixes and return unique class
+        labels, else return the unique raw BIO tags.
+
+    Returns
+    -------
+    unique_labels : list(str)
+        sorted list of unique labels.
+    """
+    unique_labels = list(set([tag for tags in labels for tag in tags]))
+    if strip_prefix:
+        unique_labels = list(set([t.split("-")[1]
+            if t != "O" else t for t in unique_labels]))
+    return sorted(unique_labels)
+
+
 def write_param_file(param_dict, param_filepath):
     """ Write configured model hyperparameters to file for documentation.
@@ -131,7 +139,7 @@ def unflatten_list(xs_flat, xs_lengths): return xs_unflat -def align_lists(labels, predictions, padding_tag=None): +def align_labels_and_predictions(labels, predictions, padding_tag=None): """ Tokenizers paired with BERT-like transformer based NERs break up tokens into word-pieces in order to minimize or eliminate [UNK] situations. However, these word-pieces count to the max_sequence_length @@ -158,14 +166,21 @@ def align_lists(labels, predictions, padding_tag=None): labels_a, predictions_a = [], [] for tags_l, tags_p in zip(labels, predictions): - if len(tags_l) != len(tags_p): - if padding_tag is not None: - tags_p = [t for t in tags_p if t != padding_tag] - labels_a.append(tags_l[0:len(tags_p)]) - predictions_a.append(tags_p) - else: + if padding_tag is not None: + assert(len(tags_l) == len(tags_p)) + tags_lp = [(l, p) for l, p in zip(tags_l, tags_p) + if p != padding_tag] + tags_l = [l for (l, p) in tags_lp] + tags_p = [p for (l, p) in tags_lp] labels_a.append(tags_l) predictions_a.append(tags_p) + else: + if len(tags_l) != len(tags_p): + labels_a.append(tags_l[0:len(tags_p)]) + predictions_a.append(tags_p) + else: + labels_a.append(tags_l) + predictions_a.append(tags_p) return labels_a, predictions_a From a94f1699eee86ceb244526bc51b1454e8a54ac22 Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 17 Jan 2020 18:15:07 -0800 Subject: [PATCH 56/64] initial revision for simpletransformer based NER --- nerds/models/__init__.py | 9 +- nerds/models/transformer.py | 269 +++++++++++++++++++++++++++++ nerds/test/test_transformer_ner.py | 22 +++ setup.py | 4 +- 4 files changed, 300 insertions(+), 4 deletions(-) create mode 100644 nerds/models/transformer.py create mode 100644 nerds/test/test_transformer_ner.py diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py index 4388d2f..ed06df1 100644 --- a/nerds/models/__init__.py +++ b/nerds/models/__init__.py @@ -1,12 +1,14 @@ from nerds.models.base import NERModel + +from nerds.models.bert import BertNER from nerds.models.bilstm import BiLstmCrfNER from nerds.models.crf import CrfNER -from nerds.models.spacy import SpacyNER from nerds.models.dictionary import DictionaryNER from nerds.models.elmo import ElmoNER -from nerds.models.flair import FlairNER -from nerds.models.bert import BertNER from nerds.models.ensemble import EnsembleNER +from nerds.models.flair import FlairNER +from nerds.models.spacy import SpacyNER +from nerds.models.transformer import TransformerNER __all__ = [ "NERModel", @@ -17,5 +19,6 @@ "ElmoNER", "FlairNER", "BertNER", + "TransformerNER", "EnsembleNER" ] diff --git a/nerds/models/transformer.py b/nerds/models/transformer.py new file mode 100644 index 0000000..26e2572 --- /dev/null +++ b/nerds/models/transformer.py @@ -0,0 +1,269 @@ +import joblib +import nerds +import os +import pandas as pd +import random +import torch + +from simpletransformers.ner.ner_model import NERModel as ST_NERModel + +from nerds.models import NERModel +from nerds.utils import (flatten_list, get_logger, + write_param_file, get_labels_from_data) + +from sklearn.model_selection import train_test_split + +log = get_logger() + +class TransformerNER(NERModel): + + def __init__(self, + lang_model_family="bert", + lang_model_name="bert-base-cased", + model_dir="models", + max_sequence_length=128, + batch_size=32, + max_iter=4, + learning_rate=4e-5, + padding_tag="O", + random_state=42): + """ Construct a Transformer NER model. This is a generic front-end + NER class that can work with multiple Transformer architectures. 
+
+        Parameters
+        ----------
+        lang_model_family : str, optional, default "bert"
+            the Transformer Language Model (LM) family to use. The following
+            LM families are supported: BERT, RoBERTa, DistilBERT, CamemBERT,
+            and XLM-RoBERTa.
+        lang_model_name : str, optional, default "bert-base-cased"
+            name of the pre-trained LM to use.
+        model_dir : str, optional, default "models"
+            directory path to folder where model artifacts will be written.
+        max_sequence_length : int, optional, default 128
+            maximum number of tokens in each input sentence. Note that
+            because of word-piece tokenization, this is not the actual
+            number of tokens, but the number of word-pieces.
+        batch_size : int, optional, default 32
+            the batch size to use during training and prediction.
+        max_iter : int, optional, default 4
+            the number of epochs to train the model.
+        learning_rate : float, optional, default 4e-5
+            learning rate for Adam optimizer.
+        padding_tag : str, default "O"
+            padding tag to use when number of predicted tags is smaller
+            than the number of label tags because of word-piece tokenization.
+            Default value ensures that you won't have to align, at the cost
+            of a drop in reported performance. You should choose a non-default
+            value and align using nerds.utils.align_labels_and_predictions().
+        random_state : int, optional, default 42
+            random state to set.
+
+        Attributes
+        ----------
+        model_ : reference to the SimpleTransformers NERModel object.
+        model_args_ : flat dictionary composed of values from constructor.
+        labels_ : list of labels to use in model.
+        """
+        super().__init__()
+        self.model_dir = model_dir
+        self.lang_model_family = lang_model_family
+        self.lang_model_name = lang_model_name
+        self.max_sequence_length = max_sequence_length
+        self.batch_size = batch_size
+        self.max_iter = max_iter
+        self.learning_rate = learning_rate
+        self.padding_tag = padding_tag
+        self.random_state = random_state
+        # attributes
+        self.model_ = None
+        self.model_args_ = None
+        self.labels_ = None
+
+
+    def fit(self, X, y):
+        """ Trains the NER model. Input is list of list of tokens and tags.
+
+        Parameters
+        ----------
+        X : list(list(str))
+            list of list of tokens
+        y : list(list(str))
+            list of list of BIO tags.
+
+        Returns
+        -------
+        self
+        """
+        self._build_model_args()
+        self.labels_ = get_labels_from_data(y)
+        self.model_ = ST_NERModel(
+            self.lang_model_family,
+            self.lang_model_name,
+            labels=self.labels_,
+            use_cuda=torch.cuda.is_available(),
+            args=self.model_args_)
+
+        os.makedirs(self.model_dir, exist_ok=True)
+
+        Xtrain, Xval, ytrain, yval = train_test_split(X, y,
+            test_size=0.1, random_state=self.random_state)
+        train_df = self._build_dataframe_from_data_labels(Xtrain, ytrain)
+        eval_df = self._build_dataframe_from_data_labels(Xval, yval)
+        self.model_.train_model(train_df, eval_df=eval_df)
+        return self
+
+
+    def predict(self, X):
+        """ Predicts using the NER model.
+
+        Parameters
+        ----------
+        X : list(list(str))
+            list of list of tokens
+
+        Returns
+        -------
+        y : list(list(str))
+            list of list of predicted BIO tags.
+        """
+        if self.model_ is None:
+            raise ValueError("No model found, either run fit() to train or load() to load a trained model.")
+
+        predictions = []
+        # ceiling division avoids dispatching a trailing empty batch
+        num_batches = (len(X) + self.batch_size - 1) // self.batch_size
+        for bid in range(num_batches):
+            b_start = bid * self.batch_size
+            b_end = min(b_start + self.batch_size, len(X))
+            b_data = X[b_start : b_end]
+            b_preds, _ = self.model_.predict([" ".join(toks) for toks in b_data])
+            # predictions are list of {token:tag} dicts
+            for i, b_pred in enumerate(b_preds):
+                prediction = flatten_list(
+                    [[v for k, v in d.items()] for d in b_pred],
+                    strip_prefix=False)
+                if len(prediction) < len(b_data[i]):
+                    prediction.extend(
+                        [self.padding_tag] * (len(b_data[i]) - len(prediction)))
+                predictions.append(prediction)
+
+        return predictions
+
+
+    def save(self, dirpath=None):
+        """ Writes out the attribute dictionary and parameter file for this
+        NER; the underlying SimpleTransformers model artifacts are saved
+        automatically after every epoch during training.
+
+        Parameters
+        ----------
+        dirpath : str, optional
+            directory to which the param file will be written. If not
+            specified, it will use the folder specified by the model's
+            output_dir.
+
+        Returns
+        -------
+        None
+        """
+        if self.model_ is None:
+            raise ValueError("No model artifacts to save, either run fit() to train or load() pretrained model.")
+        if dirpath is None:
+            self._build_model_args()
+            dirpath = self.model_args_["output_dir"]
+        attr_dict = {
+            "model_args": self.model_args_,
+            "labels": self.labels_
+        }
+        joblib.dump(attr_dict, os.path.join(dirpath, "attr_dict.pkl"))
+        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))
+
+
+    def load(self, dirpath=None):
+        """ Loads a trained model from specified folder on disk.
+
+        Parameters
+        ----------
+        dirpath : str, optional
+            directory from which model artifacts should be loaded. If
+            not provided, uses the model_args_["output_dir"].
+
+        Returns
+        -------
+        self
+        """
+        if dirpath is None:
+            self._build_model_args()
+            dirpath = self.model_args_["output_dir"]
+        if not os.path.exists(dirpath):
+            raise ValueError("Model directory not found: {:s}".format(dirpath))
+        attr_dict = joblib.load(os.path.join(dirpath, "attr_dict.pkl"))
+        self.model_args_ = attr_dict["model_args"]
+        self.labels_ = attr_dict["labels"]
+        self.model_ = ST_NERModel(self.lang_model_family, dirpath,
+            args=self.model_args_,
+            labels=self.labels_,
+            use_cuda=torch.cuda.is_available())
+        return self
+
+
+    def _build_model_args(self):
+        """ Builds the model_args_ dictionary from constructor parameters.
+
+        Parameters
+        ----------
+        none
+
+        Returns
+        -------
+        none
+        """
+        self.model_args_ = {
+            "output_dir": os.path.join(self.model_dir, "outputs"),
+            "cache_dir": os.path.join(self.model_dir, "cache"),
+            "fp16": False,
+            "fp16_opt_level": "O1",
+            "max_seq_length": self.max_sequence_length,
+            "train_batch_size": self.batch_size,
+            "gradient_accumulation_steps": 1,
+            "num_train_epochs": self.max_iter,
+            "weight_decay": 0,
+            "learning_rate": self.learning_rate,
+            "adam_epsilon": 1e-8,
+            "warmup_ratio": 0.06,
+            "warmup_steps": 0,
+            "max_grad_norm": 1.0,
+            "eval_batch_size": self.batch_size,
+            "logging_steps": 50,
+            "save_steps": 2000,
+            "overwrite_output_dir": True,
+            "reprocess_input_data": True,
+            "evaluate_during_training": True,
+            "process_count": os.cpu_count() - 2 if os.cpu_count() > 2 else 1,
+            "n_gpu": torch.cuda.device_count() if torch.cuda.is_available() else 0
+        }
+
+
+    def _build_dataframe_from_data_labels(self, data, labels):
+        """ Builds Pandas dataframe from data and labels.
+ + Parameters + ---------- + data : list(list(str)) + list of list of tokens + labels : list(list(str)) + list of list of tags + + Returns + ------- + Pandas DataFrame with columns (sentence_id, words, labels). + """ + columns = ["sentence_id", "words", "labels"] + recs = [] + for sid, (tokens, tags) in enumerate(zip(data, labels)): + for token, tag in zip(tokens, tags): + recs.append((sid, token, tag)) + data_df = pd.DataFrame.from_records(recs, columns=columns) + return data_df + diff --git a/nerds/test/test_transformer_ner.py b/nerds/test/test_transformer_ner.py new file mode 100644 index 0000000..1d46d3f --- /dev/null +++ b/nerds/test/test_transformer_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import TransformerNER +from nerds.utils import load_data_and_labels + +import numpy as np +import shutil + + +def test_bert_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = TransformerNER(model_dir="nerds/test/data/models", max_iter=1) + model.fit(X, y) + model.save() + model_r = model.load() + y_pred = model_r.predict(X) + assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + # shutil.rmtree("nerds/test/data/models") diff --git a/setup.py b/setup.py index 4d5ba23..9c55883 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,9 @@ 'spacy', 'tensorflow', 'torch', - 'transformers' + 'transformers', + 'pandas', + 'simpletransformers' ], tests_require=[ 'coverage', From 22caa4c4b0cd35d55ca88e53ac2033b2d08702ee Mon Sep 17 00:00:00 2001 From: sujitpal Date: Fri, 17 Jan 2020 21:25:21 -0800 Subject: [PATCH 57/64] adding example runs --- examples/BioNLP/test_models.py | 23 +++++++++++++++++++---- examples/GMB/test_models.py | 21 ++++++++++++++++++--- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py index 62d54af..3d6be2a 100644 --- a/examples/BioNLP/test_models.py +++ b/examples/BioNLP/test_models.py @@ -7,7 +7,8 @@ from nerds.models import ( DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, - ElmoNER, FlairNER, BertNER, EnsembleNER + ElmoNER, FlairNER, BertNER, TransformerNER, + EnsembleNER ) from nerds.utils import * @@ -85,11 +86,25 @@ labels=entity_labels)) # train and test the BERT NER -model = BertNER(max_sequence_length=256) +model = BertNER(padding_tag="X") model.fit(xtrain, ytrain) -model.save("models/bert_ner") -trained_model = model.load("models/bert_ner") +model.save("models/bert_model") +trained_model = model.load("models/bert_model") ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the Transformers NER +model = TransformerNER( + model_dir="models/transformer_model", + padding_tag="X") +model.fit(xtrain, ytrain) +model.save() +trained_model = model.load() +ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") print(classification_report(flatten_list(ytest, strip_prefix=True), flatten_list(ypred, strip_prefix=True), labels=entity_labels)) diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py index 6fe36f5..c1a583e 100644 --- a/examples/GMB/test_models.py +++ 
From 22caa4c4bd35d55ca88e53ac2033b2d08702ee Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Fri, 17 Jan 2020 21:25:21 -0800
Subject: [PATCH 57/64] adding example runs

---
 examples/BioNLP/test_models.py | 23 +++++++++++++++++++----
 examples/GMB/test_models.py    | 21 ++++++++++++++++++---
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py
index 62d54af..3d6be2a 100644
--- a/examples/BioNLP/test_models.py
+++ b/examples/BioNLP/test_models.py
@@ -7,7 +7,8 @@
 from nerds.models import (
     DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER,
-    ElmoNER, FlairNER, BertNER, EnsembleNER
+    ElmoNER, FlairNER, BertNER, TransformerNER,
+    EnsembleNER
 )
 from nerds.utils import *

@@ -85,11 +86,25 @@
                             labels=entity_labels))

 # train and test the BERT NER
-model = BertNER(max_sequence_length=256)
+model = BertNER(padding_tag="X")
 model.fit(xtrain, ytrain)
-model.save("models/bert_ner")
-trained_model = model.load("models/bert_ner")
+model.save("models/bert_model")
+trained_model = model.load("models/bert_model")
 ypred = trained_model.predict(xtest)
+ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X")
 print(classification_report(flatten_list(ytest, strip_prefix=True),
                             flatten_list(ypred, strip_prefix=True),
                             labels=entity_labels))
+
+# train and test the Transformer NER
+model = TransformerNER(
+    model_dir="models/transformer_model",
+    padding_tag="X")
+model.fit(xtrain, ytrain)
+model.save()
+trained_model = model.load()
+ypred = trained_model.predict(xtest)
+ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X")
+print(classification_report(flatten_list(ytest, strip_prefix=True),
+                            flatten_list(ypred, strip_prefix=True),
+                            labels=entity_labels))
diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py
index 6fe36f5..c1a583e 100644
--- a/examples/GMB/test_models.py
+++ b/examples/GMB/test_models.py
@@ -8,7 +8,8 @@
 from nerds.models import (
     DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER,
-    ElmoNER, FlairNER, BertNER, EnsembleNER
+    ElmoNER, FlairNER, BertNER, TransformerNER,
+    EnsembleNER
 )
 from nerds.utils import *

@@ -45,7 +46,7 @@ def convert_to_iob_format(input_file, output_file):
 os.makedirs("models")

 # read IOB file
-data, labels = load_data_and_labels("train.iob")
+data, labels = load_data_and_labels("train.iob", encoding="iso-8859-1")
 # optional: restrict dataset to 5000 sentences
 # data_s, labels_s = shuffle(data, labels, random_state=42)
 # data = data_s
@@ -119,11 +120,25 @@ def convert_to_iob_format(input_file, output_file):
                             labels=entity_labels))

 # train and test the BERT NER
-model = BertNER()
+model = BertNER(padding_tag="X")
 model.fit(xtrain, ytrain)
 model.save("models/bert_model")
 trained_model = model.load("models/bert_model")
 ypred = trained_model.predict(xtest)
+ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X")
+print(classification_report(flatten_list(ytest, strip_prefix=True),
+                            flatten_list(ypred, strip_prefix=True),
+                            labels=entity_labels))
+
+# train and test the Transformer NER
+model = TransformerNER(
+    model_dir="models/transformer_model",
+    padding_tag="X")
+model.fit(xtrain, ytrain)
+model.save()
+trained_model = model.load()
+ypred = trained_model.predict(xtest)
+ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X")
 print(classification_report(flatten_list(ytest, strip_prefix=True),
                             flatten_list(ypred, strip_prefix=True),
                             labels=entity_labels))
From 35ef8def2a221f49a0b7b89e909dccac058a025e Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Fri, 17 Jan 2020 21:27:20 -0800
Subject: [PATCH 58/64] uncommenting cleanup

---
 nerds/test/test_transformer_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nerds/test/test_transformer_ner.py b/nerds/test/test_transformer_ner.py
index 1d46d3f..493d3e9 100644
--- a/nerds/test/test_transformer_ner.py
+++ b/nerds/test/test_transformer_ner.py
@@ -19,4 +19,4 @@ def test_transformer_ner():
     y_pred = model_r.predict(X)
     assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal")
     assert_equal(len(y[0]), len(y_pred[0]), "Size of first label and prediction must be equal")
-    # shutil.rmtree("nerds/test/data/models")
+    shutil.rmtree("nerds/test/data/models")
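
The uncommented cleanup above removes `nerds/test/data/models` only when all assertions pass. A hedged alternative, not what the patch does, is to train into a temporary directory and clean up in a `finally` block, so artifacts are removed even when an assertion fails:

```
import shutil
import tempfile

from nerds.models import TransformerNER
from nerds.utils import load_data_and_labels

def test_transformer_ner():
    # hypothetical variant of the test above: cleanup is guaranteed
    model_dir = tempfile.mkdtemp()
    try:
        X, y = load_data_and_labels("nerds/test/data/example.iob")
        model = TransformerNER(model_dir=model_dir, max_iter=1)
        model.fit(X, y)
        model.save()
        y_pred = model.load().predict(X)
        assert len(y) == len(y_pred)
    finally:
        shutil.rmtree(model_dir, ignore_errors=True)
```
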
From c9f5578a01bf39a1ffa2718641f74a8838d8dc76 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Sat, 18 Jan 2020 13:43:17 -0800
Subject: [PATCH 59/64] adding classification report for TransformerNER

---
 examples/BioNLP/README.md | 17 +++++++++++++++++
 examples/GMB/README.md    | 20 ++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/examples/BioNLP/README.md b/examples/BioNLP/README.md
index aa32330..ac4d7f3 100644
--- a/examples/BioNLP/README.md
+++ b/examples/BioNLP/README.md
@@ -119,6 +119,23 @@
 weighted avg       0.80      0.73      0.76     19392

 ```

+### Transformer NER (lang_model_family="bert", lang_model_name="bert-base-cased", max_sequence_length=128, batch_size=32, max_iter=4, learning_rate=4e-5, padding_tag="O", random_state=42)
+
+```
+              precision    recall  f1-score   support
+
+   cell_line       0.80      0.60      0.68      1977
+   cell_type       0.75      0.89      0.81      4161
+     protein       0.88      0.81      0.84     10700
+         DNA       0.84      0.82      0.83      2912
+         RNA       0.85      0.79      0.82       325
+
+   micro avg       0.83      0.81      0.82     20075
+   macro avg       0.82      0.78      0.80     20075
+weighted avg       0.84      0.81      0.82     20075
+
+```
+
 ### Majority voting ensemble (pre-trained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER)

 ```
diff --git a/examples/GMB/README.md b/examples/GMB/README.md
index 0b6f54d..3441236 100644
--- a/examples/GMB/README.md
+++ b/examples/GMB/README.md
@@ -144,6 +144,26 @@
 weighted avg       0.86      0.84      0.85     48418

 ```

+### Transformer NER (lang_model_family="bert", lang_model_name="bert-base-cased", max_sequence_length=128, batch_size=32, max_iter=4, learning_rate=4e-5, padding_tag="O", random_state=42)
+
+```
+              precision    recall  f1-score   support
+
+         art       0.11      0.24      0.15        97
+         eve       0.41      0.55      0.47       126
+         geo       0.90      0.88      0.89     14016
+         gpe       0.94      0.96      0.95      4724
+         nat       0.34      0.80      0.48        40
+         org       0.80      0.81      0.81     10669
+         per       0.91      0.90      0.90     10402
+         tim       0.89      0.93      0.91      7739
+
+   micro avg       0.87      0.88      0.88     47813
+   macro avg       0.66      0.76      0.69     47813
+weighted avg       0.88      0.88      0.88     47813
+
+```
+
 ### Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER)

 ```
From 58c556cbd15bcc0564b5b69f6e53f23308b56385 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 21 Jan 2020 12:04:28 -0800
Subject: [PATCH 60/64] updated change list to include TransformerNER

---
 docs/CHANGES.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/CHANGES.md b/docs/CHANGES.md
index 006094f..255dc5d 100644
--- a/docs/CHANGES.md
+++ b/docs/CHANGES.md
@@ -27,8 +27,10 @@
   * does not give timestep size errors
 * ElmoNER
   * New, available in Anago DEV repo, same API as Anago's BiLSTMCRF
-* FLAIR based NER
+* FlairNER
   * New, incorporated from the [Zalando Flair project](https://github.com/flairNLP/flair).
+* TransformerNER
+  * New, provides support for transformer-based NERs using a choice of BERT, RoBERTa, DistilBERT, CamemBERT, and XLM-RoBERTa language models, via the [SimpleTransformers library](https://pypi.org/project/simpletransformers/).
 * EnsembleNER
   * simpler interface
   * weights from each classifier
@@ -46,5 +48,4 @@

 ## Planned

-* BERT Transformer based NER

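
As the change list entry above says, TransformerNER delegates to the SimpleTransformers library: the (lang_model_family, lang_model_name) pair seen in the README report headings maps onto the first two arguments of its `NERModel` class (aliased as `ST_NERModel` in the `load()` method of patch 56). A minimal sketch, assuming a CPU-only setup and an illustrative label set:

```
from simpletransformers.ner import NERModel

labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]  # illustrative label set
model = NERModel(
    "distilbert",               # lang_model_family
    "distilbert-base-cased",    # lang_model_name
    labels=labels,
    use_cuda=False)             # NERDS passes torch.cuda.is_available() here
```
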
From 607bde5d0a17f3cd75943d93566b680945640dd6 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 21 Jan 2020 12:06:44 -0800
Subject: [PATCH 61/64] adding TransformerNER blog post link

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 3828e36..befd880 100644
--- a/README.md
+++ b/README.md
@@ -136,3 +136,4 @@ The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that w
 * \[slides\] [Slides for talk at PyData LA 2019](https://www.slideshare.net/sujitpal/building-named-entity-recognition-models-efficiently-using-nerds).
 * \[video\] [Video of talk at PyData LA 2019](https://www.youtube.com/watch?v=ilzFiK0nAh8).
 * \[blog\] [Incorporating third party NER (Flair) into NERDS](https://sujitpal.blogspot.com/2019/12/incorporating-flair-ner-into-nerds.html).
+* \[blog\] [Adding a Transformer based NER model into NERDS](https://sujitpal.blogspot.com/2020/01/adding-transformer-based-ner-model-into.html).
From 514ac22fcdf71b96484957445097af613c472594 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 21 Jan 2020 12:23:59 -0800
Subject: [PATCH 62/64] updated preprocessing logic and marked it experimental

---
 nerds/models/bert.py | 27 ++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/nerds/models/bert.py b/nerds/models/bert.py
index bad3a94..56aa3f9 100644
--- a/nerds/models/bert.py
+++ b/nerds/models/bert.py
@@ -36,6 +36,12 @@ def __init__(self,
         and a Fine tuning model for NER is provided by the HuggingFace
         transformers library.

+        NOTE: this is an experimental NER that did not perform very well, and
+        is only here for reference purposes. It has been superseded by the
+        TransformerNER model, which offers the same functionality (and improved
+        performance) not only with BERT as the underlying language model (as
+        this one does), but also with other BERT-like language model backends.
+
         Parameters
         ----------
         lang_model : str, optional, default "bert-base-cased"
@@ -403,28 +409,27 @@ def _data_labels_to_features(self, data, labels):
             tokens_sent = tokens_sent[0:self.max_sequence_length - 2]
             tags_sent = tags_sent[0:self.max_sequence_length - 2]

-            # pad upto the max_sequence_length - 2 (account for special tokens CLS and SEP)
-            tokens_to_pad = self.max_sequence_length - (len(tokens_sent) + 2)
+            # prepend [CLS] and append [SEP]
+            tokens_sent = [self.tokenizer_.cls_token] + tokens_sent + [self.tokenizer_.sep_token]
+            tags_sent = [self._pad_label_id] + tags_sent + [self._pad_label_id]
+
+            # pad up to max_sequence_length ([CLS] and [SEP] are already in place)
+            tokens_to_pad = self.max_sequence_length - len(tokens_sent)
             tokens_sent.extend([self.tokenizer_.pad_token] * tokens_to_pad)
             tags_sent.extend([self._pad_label_id] * tokens_to_pad)

-            # add in [CLS] and [SEP] for tags_sent and tokens_sent
-            input_tokens = self.tokenizer_.build_inputs_with_special_tokens(tokens_sent)
-            input_tags = [self._pad_label_id] + tags_sent + [self._pad_label_id]
-
             # feature: input_ids
-            input_ids.append(self.tokenizer_.convert_tokens_to_ids(input_tokens))
+            input_ids.append(self.tokenizer_.convert_tokens_to_ids(tokens_sent))
             # feature: attention_mask
-            attention_mask.append([0 if t == self.tokenizer_.pad_token else 1 for t in input_tokens])
+            attention_mask.append([0 if t == self.tokenizer_.pad_token else 1 for t in tokens_sent])
             # feature: token_type_ids
-            token_type_ids.append(
-                self.tokenizer_.create_token_type_ids_from_sequences(tokens_sent))
+            token_type_ids.append([0] * self.max_sequence_length)
             # feature: label_ids
-            label_ids.append(input_tags)
+            label_ids.append(tags_sent)

             if self.verbose and i < 5:
                 log.info("row[{:d}].features:".format(i))
-                log.info("  input_tokens:", input_tokens)
+                log.info("  input_tokens:", tokens_sent)
                 log.info("  input_ids:", input_ids[i])
                 log.info("  attention_mask:", attention_mask[i])
                 log.info("  token_type_ids:", token_type_ids[i])
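
A toy walk-through of the reordered preprocessing above, with hypothetical tokens, `max_sequence_length=8`, and string stand-ins for the tokenizer's real pad token and the model's pad label id:

```
max_sequence_length = 8
# after truncation to max_sequence_length - 2, [CLS]/[SEP] are added first...
tokens = ["[CLS]", "il", "-", "2", "gene", "[SEP]"]
tags = ["PAD", "B-DNA", "I-DNA", "I-DNA", "I-DNA", "PAD"]

# ...and only then is the sentence padded out to max_sequence_length
num_pad = max_sequence_length - len(tokens)
tokens += ["[PAD]"] * num_pad
tags += ["PAD"] * num_pad
attention_mask = [0 if t == "[PAD]" else 1 for t in tokens]
token_type_ids = [0] * max_sequence_length

print(tokens)          # ['[CLS]', 'il', '-', '2', 'gene', '[SEP]', '[PAD]', '[PAD]']
print(attention_mask)  # [1, 1, 1, 1, 1, 1, 0, 0]
```
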
From 951fdc406a96e2cc87580b7d42084df2d949ca94 Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 21 Jan 2020 15:11:41 -0800
Subject: [PATCH 63/64] predict does its own batching, more efficient without external batching

---
 nerds/models/transformer.py | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/nerds/models/transformer.py b/nerds/models/transformer.py
index 26e2572..4b8a668 100644
--- a/nerds/models/transformer.py
+++ b/nerds/models/transformer.py
@@ -132,24 +132,20 @@ def predict(self, X):
         if self.model_ is None:
             raise ValueError("No model found, either run fit() to train or load() to load a trained model.")

-        predictions = []
-        num_batches = (len(X) // self.batch_size) + 1
-        for bid in range(num_batches):
-            b_start = bid * self.batch_size
-            b_end = min(b_start + self.batch_size, len(X))
-            b_data = X[b_start : b_end]
-            b_preds, _ = self.model_.predict([" ".join(toks) for toks in b_data])
-            # predictions are list of {token:tag} dicts
-            for i, b_pred in enumerate(b_preds):
-                prediction = flatten_list(
-                    [[v for k, v in d.items()] for d in b_pred],
-                    strip_prefix=False)
-                if len(prediction) < len(b_data[i]):
-                    prediction.extend(
-                        [self.padding_tag] * (len(b_data[i]) - len(prediction)))
-                predictions.append(prediction)
-
-        return predictions
+        predictions, _ = self.model_.predict([" ".join(toks) for toks in X])
+        # predictions are list of {token:tag} dicts
+        predictions = [[tag for token_tag_dict in prediction
+                        for (token, tag) in token_tag_dict.items()]
+                       for prediction in predictions]
+        # handle possible truncation of prediction (and subsequent mismatch
+        # with labels) because of too long token list
+        predictions_a = []
+        for prediction, tokens in zip(predictions, X):
+            if len(prediction) < len(tokens):
+                prediction.extend(
+                    [self.padding_tag] * (len(tokens) - len(prediction)))
+            predictions_a.append(prediction)
+        return predictions_a

     def save(self, dirpath=None):
From 366420b2ec57bf790562de62a79f4973cbd6b3ed Mon Sep 17 00:00:00 2001
From: sujitpal
Date: Tue, 28 Jan 2020 17:10:15 -0800
Subject: [PATCH 64/64] Updated Pipfile based on security vulnerability, removing Pipfile.lock since it is no longer used.

---
 Pipfile      |    2 +-
 Pipfile.lock | 1335 --------------------------------------------------
 2 files changed, 1 insertion(+), 1336 deletions(-)
 delete mode 100644 Pipfile.lock

diff --git a/Pipfile b/Pipfile
index 7de5051..66a9531 100644
--- a/Pipfile
+++ b/Pipfile
@@ -20,7 +20,7 @@ scipy = "*"
 sklearn = "*"
 sklearn-crfsuite = "*"
 spacy = "==2.0.11"
-tensorflow = "*"
+tensorflow = ">=1.15.2"
 nose = "*"
 coverage = "*"
 "flake8" = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
deleted file mode 100644
index 171961b..0000000
--- a/Pipfile.lock
+++ /dev/null
@@ -1,1335 +0,0 @@
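
The new `predict()` in patch 63 leaves all batching to simpletransformers and keeps only two post-processing steps. A standalone illustration of those steps, with made-up tokens and tags (a truncated prediction is right-padded with `padding_tag` so it stays aligned with its input sentence):

```
X = [["John", "lives", "in", "New", "York", "City"]]
raw_predictions = [[{"John": "B-PER"}, {"lives": "O"}, {"in": "O"},
                    {"New": "B-LOC"}, {"York": "I-LOC"}]]  # one tag short
padding_tag = "X"

# step 1: flatten the per-sentence [{token: tag}, ...] dicts into tag lists
predictions = [[tag for token_tag_dict in prediction
                for tag in token_tag_dict.values()]
               for prediction in raw_predictions]
# step 2: right-pad predictions that were truncated at max_seq_length
for prediction, tokens in zip(predictions, X):
    if len(prediction) < len(tokens):
        prediction.extend([padding_tag] * (len(tokens) - len(prediction)))

print(predictions)  # [['B-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'X']]
```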
"sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287", - "sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1", - "sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000", - "sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1", - "sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e", - "sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5", - "sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062", - "sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba", - "sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc", - "sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc", - "sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99", - "sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653", - "sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c", - "sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558", - "sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f", - "sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9", - "sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd", - "sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d", - "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6", - "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80" - ], - "index": "pypi", - "version": "==4.5.1" - }, - "cymem": { - "hashes": [ - "sha256:00bb3645dfb9a020d735ba3d6f822b04656388180588d8b2cebde967ee678bcc", - "sha256:0dd61d05977839a922c0d797c355b98949210575918b1743b41e38ae9fb2c3a7", - "sha256:4bc1056b52d959fcbb1e0f32ec84fa131754d6be1e36b65782c6ac86419f4bf3", - "sha256:4c5d9ca6ec706792b8d9b1faf6db77b95545c388c768b21d940f197aa7efbb7e", - "sha256:50292f4dd0d950a8698bae27d71efe59da7ff08e591b735e08b658aae42c4745", - "sha256:616d06333f46dd03c128d97912d361183fc02249e6420a7b7907b41214c51562", - "sha256:944af97d4d34a2470b5199f1c31d2dfc79cdec7bd7a41354d839a8ab87fdfaa6", - "sha256:b38056efb99078b06c504adb5f03a8d9e822a5543451737b746028a71c4b1ac3", - "sha256:b6513b2926c60d641f159e79e6fb16460dfb50ebcce31a5af0370c51837c7efc", - "sha256:daa6003fcc199752ab703142021cff74774872a932303b240dc0ea177adf295d", - "sha256:f06d9b50da0474d7405674d8101c319d89a17d33792d6d429fe3d5c64f0d9df1" - ], - "version": "==1.31.2" - }, - "cytoolz": { - "hashes": [ - "sha256:476a2ad176de5eaef80499b7b43d4f72ba6d23df33d349088dae315e9b31c552" - ], - "version": "==0.8.2" - }, - "decorator": { - "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" - ], - "version": "==4.3.0" - }, - "dill": { - "hashes": [ - "sha256:624dc244b94371bb2d6e7f40084228a2edfff02373fe20e018bef1ee92fdd5b3" - ], - "version": "==0.2.8.2" - }, - "flake8": { - "hashes": [ - "sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0", - "sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37" - ], - "index": "pypi", - "version": "==3.5.0" - }, - "future": { - "hashes": [ - "sha256:e39ced1ab767b5936646cedba8bcce582398233d6a627067d4c6a454c90cfedb" - ], - "index": "pypi", - "version": "==0.16.0" - }, - "gast": { - "hashes": [ - "sha256:7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930" - ], - "version": "==0.2.0" - }, - "grpcio": { - "hashes": [ - 
"sha256:0feade5de967be3c9ee041662d1347fc537ad05ccbcf05bcf1efa05072bef926", - "sha256:1ae02a9787cf2c5f25add0806f6271283b6074ab8619077d2b5c9037950c890b", - "sha256:2d7215dca11ba4aa49cc6c05b37e4b0a0f99727c8604e8ccd5ef1f6e06332200", - "sha256:35a4f6ffae88ce6a461e503ae91b62dc5c96013cafc717f2d7139686b5c39969", - "sha256:3be7635b4308e06449b2275a5e96a030bbf82ba6797ae8947f14667491924d81", - "sha256:3d1b3e7042a41b167334f718842f13deb80287886c9160efe31252602b13a128", - "sha256:468d4ce007cb859d5f9440cf4a7461cc172fd07d690300f4db88afaa78f01003", - "sha256:59a2fb52d286a38b9cbc7434eb473026fde0b20c223a10a99f5c3d4e395c2c2b", - "sha256:59c7670c902acce952ba709d9126cda87a45d7fed6bd568868e74171e4acd7f7", - "sha256:5b03fd3941c5e1a5deb01026bae025d319b38d3facb3e5fc491bca73e908d69e", - "sha256:65842e698776f4e49f62346c0f80fc31b34907e0df4247650c643113ef167122", - "sha256:68dbe71f890475e2824afbc5dc72714d1fca668bc15df0954bda4a8a5a53d0c7", - "sha256:868973b64b7e2464e5297cc660da588c542c175e85f6d2f7490d86c0dd5dbb4c", - "sha256:86f0c2062fde76789f7cdbf67d4ede116e7e1ceaf4c327fff7b9d17eb5852403", - "sha256:9444863aaba55b662719e22680f11134182604619f241cc607020e5b3786f4cd", - "sha256:9ac704e25d271af62c1ea72f1cb42ec7938f26f00314a8f324999ac5e1bf55eb", - "sha256:a02ef0354fb455a9ce2ad869a40f28f20a64147d46557c59b7269a15832c36d2", - "sha256:aabcdc960633231f9575252c061b480fc56a1ff6dcc7999fa5d4968f574d894f", - "sha256:b47a19a3be2f9608b4296bd16374c9a922d3206cf0a917792801a5cef5a2fa23", - "sha256:bf7bfe162057e6f1e3f4613b2a5f1157c8e286bddeaa40f7b8ce5054cb4b1413", - "sha256:c3cf3f431b41c39aa1501458d0e46086e699836536af873fe028dda1dfc6bcbd", - "sha256:cc7bd47eca988831d58a618908c825204d6ee8e90cdb9854a09b52a3b76ac168", - "sha256:cde83440fb4691d1bd8620ea919a9bd3199e6725e72d2c0d94898a2774c255ee", - "sha256:e1a03666852b956f7949c2a7f187dd54406cae2874c2ce26c1a0dafddf812cb2", - "sha256:e322eb5dc533cfbf21a9e964ebab80da391a26234a82288bfce505058913dfac", - "sha256:f0169d98670ef1db52e4f6930fd470c34731948350cabbe93087a8462b1f1da4", - "sha256:f14faadfd09aa8526536cd2149e274563f45b767fca1736ccc53803a6af3f90e" - ], - "version": "==1.12.1" - }, - "h5py": { - "hashes": [ - "sha256:0f8cd2acbacf3177b4427ed42639c911667b1f24d923388ab1f8ad466a12be5e", - "sha256:11277e3879098f921ee9e29105b20591e1dfdd44963357399f2abaa1a280c560", - "sha256:1241dec0c94ac32f3285cac1d6f44beabf80423e422ab03bd2686d731a8a9294", - "sha256:17b8187de0b3a945d8e8d031e7eb6ece2fce90791f9c5fde36f4396bf38fdde1", - "sha256:308e0758587ee16d4e73e7f2f8aae8351091e343bf0a43d2f697f9535465c816", - "sha256:37cacddf0e8209905f52537a8cf71da0dd9a4de62bd79247274c97b24a408997", - "sha256:38a23bb599748adf23d77f74885c0de6f4a7d9baa42f74e476bbf90fba2b47dd", - "sha256:47ab18b7b7bbc36fd2b606289b703b6f0ee915b923d6ad94dd17ac80ebffc280", - "sha256:486c78330af0bf33f5077b51d1888c0739c3cd1a03d5aade0d48572b3b5690ca", - "sha256:4e2183458d6ef1ae87dfb5d6acd0786359336cd9ac0ece6396c09b59fdaa3bd6", - "sha256:51d0595c3e58814c831f6cd2b664a5bf9590e26262c1d541b380d041e4fcb3c0", - "sha256:56d259d56822b70881760b243957f04a0cf133f0ec65eae6a33f562826aee899", - "sha256:5e6e777653169a3cc24ea56bb3d8c845ea391f8914c35bb6f350b0753a52891c", - "sha256:62bfb0ebb0f59e5dccc0b0dbbc0fc40dd1d1e09d04c0dc71f89790231531d4a2", - "sha256:67d89b64debfa021b54aa6f24bbf008403bd144748a0148596b518bce80d2fc4", - "sha256:9214ca445c18a37bfe9c165982c0e317e2f21f035c8d635d1c6d9fcbaf35b7a8", - "sha256:ab0c52850428d2e86029935389379c2c97f752e76b616da851deec8a4484f8ec", - "sha256:b2eff336697d8dfd712c5d93fef9f4e4d3e97d9d8c258801836b8664a239e07a", - 
"sha256:bb33fabc0b8f3fe3bb0f8d6821b2fad5b2a64c27a0808e8d1c5c1e3362062064", - "sha256:bd5353ab342bae1262b04745934cc1565df4cbc8d6a979a0c98f42209bd5c265", - "sha256:c45650de228ace7731e4280e14fb687f6d5c29cd666c5b22b42492b035e994d6", - "sha256:d5c0c01da45f901a3d429e7ef9e7e22baa869e1affb8715f1bf94e6a30020740", - "sha256:d75035db5bde802a29f4f29f18bb7548863d29ac90ccbf2c04c11799bbbba2c3", - "sha256:dda88206dc9464923f27f601000bc5b152ac0bd6d0122f098d4f239150a70076", - "sha256:e1c2ac5d0aa232c0f60fecc6bd1122346885086a176f939b91058c4c980cc226", - "sha256:e626c65a8587921ebc7fb8d31a49addfdd0b9a9aa96315ea484c09803337b955" - ], - "index": "pypi", - "version": "==2.8.0" - }, - "html5lib": { - "hashes": [ - "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" - ], - "version": "==0.9999999" - }, - "hyperopt": { - "hashes": [ - "sha256:4f6e903f7640165ea3e4c622050b41ffab0bee7811ede23c7825a5884976d72f" - ], - "index": "pypi", - "version": "==0.1" - }, - "idna": { - "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" - ], - "version": "==2.7" - }, - "joblib": { - "hashes": [ - "sha256:9de5fe8bc953f871f862d27e77f153c31d545b84f2aa31a63b5165e912ad6dfa", - "sha256:aba9f97aa3e0548be6fc458b5d708be863eb4be35830caeb3faa7bd3d9afb7bb" - ], - "index": "pypi", - "version": "==0.12.0" - }, - "keras": { - "hashes": [ - "sha256:5b8499d157af217f1a5ee33589e774127ebc3e266c833c22cb5afbb0ed1734bf", - "sha256:fa71a1f576dbd643532b872b8952afb65cc3ff7ed20d172e6b49657b710b43d0" - ], - "index": "pypi", - "version": "==2.2.0" - }, - "keras-applications": { - "hashes": [ - "sha256:7dceb9820b39c01459ea5e8922add86eb99a9e14354c33dc9981d5f5077fa0ac", - "sha256:9924be748e5d180806d133c714d22895b997ed722757491dd99538851145d3bf" - ], - "version": "==1.0.2" - }, - "keras-preprocessing": { - "hashes": [ - "sha256:5283236f0b22a57b30bda766fc819b2ed2483c52f3e1f8b39fcc528f51f772e7", - "sha256:8649ba6377ecc06ea10e0a8a954df5600d115b4b626861e33c79b41ec03c5194" - ], - "version": "==1.0.1" - }, - "markdown": { - "hashes": [ - "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", - "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" - ], - "version": "==2.6.11" - }, - "mccabe": { - "hashes": [ - "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", - "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" - ], - "version": "==0.6.1" - }, - "msgpack-numpy": { - "hashes": [ - "sha256:6947df61826a2917e38dbe07957a0c70dc82dce93ec38153dae850fdd21a4583", - "sha256:afc603c7cf8497fb125a8c8c713518a004e9662101f088e3d4fcf7688b08eeb3" - ], - "version": "==0.4.1" - }, - "msgpack-python": { - "hashes": [ - "sha256:378cc8a6d3545b532dfd149da715abae4fda2a3adb6d74e525d0d5e51f46909b" - ], - "version": "==0.5.6" - }, - "murmurhash": { - "hashes": [ - "sha256:651137ed3e1169342c9edade454f3beb7fcdf28d4ad1ac232725237eaf442d9a" - ], - "version": "==0.28.0" - }, - "networkx": { - "hashes": [ - "sha256:0d0e70e10dfb47601cbb3425a00e03e2a2e97477be6f80638fef91d54dd1e4b8", - "sha256:1b229b54fe9ccb009cee4de02a88552191497a542a7d5d34adab216b9f15c1ff", - "sha256:b3e0144d5fe6b7479b694e1b598a5545a38f3fc6f1e3c09173eb30f0c7a5770e" - ], - "index": "pypi", - "version": "==1.11" - }, - "nltk": { - "hashes": [ - "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" - ], - "index": "pypi", - "version": "==3.3.0" - }, - "nose": { - "hashes": [ - 
"sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", - "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", - "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" - ], - "index": "pypi", - "version": "==1.3.7" - }, - "numpy": { - "hashes": [ - "sha256:07379fe0b450f6fd6e5934a9bc015025bb4ce1c8fbed3ca8bef29328b1bc9570", - "sha256:085afac75bbc97a096744fcfc97a4b321c5a87220286811e85089ae04885acdd", - "sha256:2d6481c6bdab1c75affc0fc71eb1bd4b3ecef620d06f2f60c3f00521d54be04f", - "sha256:2df854df882d322d5c23087a4959e145b953dfff2abe1774fec4f639ac2f3160", - "sha256:381ad13c30cd1d0b2f3da8a0c1a4aa697487e8bb0e9e0cbeb7439776bcb645f8", - "sha256:385f1ce46e08676505b692bfde918c1e0b350963a15ef52d77691c2cf0f5dbf6", - "sha256:4d278c2261be6423c5e63d8f0ceb1b0c6db3ff83f2906f4b860db6ae99ca1bb5", - "sha256:51c5dcb51cf88b34b7d04c15f600b07c6ccbb73a089a38af2ab83c02862318da", - "sha256:589336ba5199c8061239cf446ee2f2f1fcc0c68e8531ee1382b6fc0c66b2d388", - "sha256:5edf1acc827ed139086af95ce4449b7b664f57a8c29eb755411a634be280d9f2", - "sha256:6b82b81c6b3b70ed40bc6d0b71222ebfcd6b6c04a6e7945a936e514b9113d5a3", - "sha256:6c57f973218b776195d0356e556ec932698f3a563e2f640cfca7020086383f50", - "sha256:758d1091a501fd2d75034e55e7e98bfd1370dc089160845c242db1c760d944d9", - "sha256:8622db292b766719810e0cb0f62ef6141e15fe32b04e4eb2959888319e59336b", - "sha256:8b8dcfcd630f1981f0f1e3846fae883376762a0c1b472baa35b145b911683b7b", - "sha256:97fa8f1dceffab782069b291e38c4c2227f255cdac5f1e3346666931df87373e", - "sha256:9d69967673ab7b028c2df09cae05ba56bf4e39e3cb04ebe452b6035c3b49848e", - "sha256:9e1f53afae865cc32459ad211493cf9e2a3651a7295b7a38654ef3d123808996", - "sha256:a4a433b3a264dbc9aa9c7c241e87c0358a503ea6394f8737df1683c7c9a102ac", - "sha256:baadc5f770917ada556afb7651a68176559f4dca5f4b2d0947cd15b9fb84fb51", - "sha256:c725d11990a9243e6ceffe0ab25a07c46c1cc2c5dc55e305717b5afe856c9608", - "sha256:d696a8c87315a83983fc59dd27efe034292b9e8ad667aeae51a68b4be14690d9", - "sha256:e1864a4e9f93ddb2dc6b62ccc2ec1f8250ff4ac0d3d7a15c8985dd4e1fbd6418" - ], - "index": "pypi", - "version": "==1.14.5" - }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - ], - "version": "==1.0.1" - }, - "plac": { - "hashes": [ - "sha256:854693ad90367e8267112ffbb8955f57d6fdeac3191791dc9ffce80f87fd2370", - "sha256:ba3f719a018175f0a15a6b04e6cc79c25fd563d348aacd320c3644d2a9baf89b" - ], - "version": "==0.9.6" - }, - "preshed": { - "hashes": [ - "sha256:a6b3a9e34634600e3e410ec25e0debed4b65a47eb37514a063d189d1c425b4dd" - ], - "version": "==1.0.0" - }, - "protobuf": { - "hashes": [ - "sha256:12985d9f40c104da2f44ec089449214876809b40fdc5d9e43b93b512b9e74056", - "sha256:12c97fe27af12fc5d66b23f905ab09dd4fb0c68d5a74a419d914580e6d2e71e3", - "sha256:327fb9d8a8247bc780b9ea7ed03c0643bc0d22c139b761c9ec1efc7cc3f0923e", - "sha256:3895319db04c0b3baed74fb66be7ba9f4cd8e88a432b8e71032cdf08b2dfee23", - "sha256:695072063e256d32335d48b9484451f7c7948edc3dbd419469d6a778602682fc", - "sha256:7d786f3ef5b33a04e6538089674f244a3b0f588155016559d950989010af97d0", - "sha256:8bf82bb7a466a54be7272dcb492f71d55a2453a58d862fb74c3f2083f2768543", - "sha256:9bbc1ae1c33c1bd3a2fc05a3aec328544d2b039ff0ce6f000063628a32fad777", - "sha256:9f1087abb67b34e55108bc610936b34363a7aac692023bcbb17e065c253a1f80", - "sha256:9fefcb92a3784b446abf3641d9a14dad815bee88e0edd10b9a9e0e144d01a991", - "sha256:a37836aa47d1b81c2db1a6b7a5e79926062b5d76bd962115a0e615551be2b48d", - 
"sha256:cca22955443c55cf86f963a4ad7057bca95e4dcde84d6a493066d380cfab3bb0", - "sha256:d7ac50bc06d31deb07ace6de85556c1d7330e5c0958f3b2af85037d6d1182abf", - "sha256:dfe6899304b898538f4dc94fa0b281b56b70e40f58afa4c6f807805261cbe2e8" - ], - "version": "==3.6.0" - }, - "pyahocorasick": { - "hashes": [ - "sha256:3d584e7836ca7b066f99d7fdb384dc6ef7af211b2b139baedbd960c7c279bb7f" - ], - "index": "pypi", - "version": "==1.1.8" - }, - "pycodestyle": { - "hashes": [ - "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", - "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" - ], - "version": "==2.3.1" - }, - "pyflakes": { - "hashes": [ - "sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f", - "sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805" - ], - "version": "==1.6.0" - }, - "pymongo": { - "hashes": [ - "sha256:061085dfe4fbf1d9d6ed2f2e52fe6ab72559e48b4294370b433751638160d10b", - "sha256:07fdee1c5567f237796a8550233e04853785d8dcf95929f96ab519ed91543109", - "sha256:0d98731aaea8cb32b535c376f6785927e4e3d9459ffe1440b8a639827a849350", - "sha256:10f683950f70626ccedf4a662d1c0b3244e8e013c2067872af5633830abd1bfd", - "sha256:2954b99cfeb76776879e9f8a4cae9c5e19d5eff92d0b7b663ceddcf192adb66b", - "sha256:419ed5d5b76ef304815f354d9df7f2085acfd6ff7cc1b714ca702e2239b341c2", - "sha256:42ec201fd9a26e7c1e611e3db19324dead51dd4646391492eb238b41749340e8", - "sha256:4400fa92af310bf66b76c313c7ded3bb63f3d63b4f43c3bfbff552cf294dc9fa", - "sha256:4807dfbb5cdcfe0224329992dc48b897c780d0ad7553c3799d34f84ba5cab446", - "sha256:54daf67e1e7e7e5a5160c86123bdd39b1d3b25876c2ab38230dc2a764cb3d98f", - "sha256:5fd6ce5ed3c6c92d2c94756e6bf041304e5c7c5a5dbea31b8957d52a78bdf01d", - "sha256:601e00fe7fb283f04c95f5dafb787c0862f48ca015a6f1f81b460c74e4303873", - "sha256:7fbd9233e8b6741b047c5857e2ad5efb74091f167d7fa8a2a3379217165058f9", - "sha256:7ffac35362c07c103b024b89875e8d7f0625129b65c56fa8a3ecebbd56110405", - "sha256:833bc6cb2ec7058dea9f5840a9314ac74738d2117486a044e88f3976e37ea7a0", - "sha256:92cb26a2a9b38e8df5215803f950b20a6c847d5e00d1dd125eaa84f05f9472d7", - "sha256:9e5f0e8967d95a256038817460844a8aab588b9bc9ba6296507a1863960a0e44", - "sha256:abf83b908e535b1386a7732825994e6e36eff6394c1829f3e7a23888136484fa", - "sha256:adb2dba52c8a2a2d7bcd3b267f7bbf7c822850cf6a7cd15211b9f386c3a670ef", - "sha256:ae7b3479822a03f6f651913de84ba67101f23e051ae88034085e974f472dcfff", - "sha256:cc15b30f0ac518e6cbd4b6e6e6162f8aa14edfe255d0841146f146151bd58865", - "sha256:d23498d62063b715078947bef48fa4d34dc354f3b268ed15dc6b46fc809a88e9", - "sha256:dd29bb5bc9068ccc248c8c145efd839421f04363b468b47cfa2d4902ca369afe", - "sha256:e53ad0cc6c489f83e7f6bb6121aa73bb6f6488410024a3bd77c16af1aa3a1000", - "sha256:ecb11113407d919f8714cc7d0841985044633d0b561ef3d797e1b494a3e73537", - "sha256:ece2c2add66d3ec2720a963bf073ca11fc3b0b58159767fc3bc5ddaad791d481", - "sha256:ef25c8675f5c8c19832f69cd97d728d99bb4ab9c3b200e28a5c8416631afaf3c", - "sha256:f62a818d643776873713c5676f17bd95ac4176220b13dd12c14edd3a450d1ac9", - "sha256:f7ebcb846962ee40374db2d9014a89bea9c983ae63c1877957c3a0a756974796" - ], - "version": "==3.6.1" - }, - "python-crfsuite": { - "hashes": [ - "sha256:10d84507d96d2870fba053d6659170113675762745e715befe0d9671d8988098", - "sha256:19882b03d26abe075c280f3450829f520a36d17a050621e48109094ea94f4965", - "sha256:2e1cdceca173cf73360220737648acf87244c1ae98eebf9f41d7a86035d4eaac", - "sha256:3b36da634400dd1557bb947ddf009e6328e7ef76332054f6484e8cec6993b86e", - 
"sha256:470220d0f4be28769505c5cd8e2854ee25d748bc38d70f42811832031f245273", - "sha256:47f10949a003439cdb7922b43177c96c2c1eea5c56ed2d17270c850bc86ccab3", - "sha256:5ad7395dcce74fd07fc031f1e6c1160ff7b43ea343129caa77fb8d8afe7e2ecb", - "sha256:607f4276025b2217e326e6c9161308fb2ad7f881665bfff072dbedae6cdc9cf8", - "sha256:6c03fe809957eaf9333816f52dd497678a013ee918c502a7a529b191ef19c694", - "sha256:73b57919711499d2ff32843a28b2a6c17487630ba255a6aa4a093ee0a7f3b1fa", - "sha256:7861c9c7635e868c67914e77ccf757e096387edf676f78425aa05b2f8b32ced7", - "sha256:8b656eee39d0fdea760285b574c5517d309f943be98d87e42425d332cdf687ca", - "sha256:942aa72793dad1f9b0d65991b4386e385c4bf9248d2072e0e058f6cbb41e15dc", - "sha256:968f8597f5df1ec54c5e72ce50775118a09f3c5737962e058f94a83da30d2a68", - "sha256:96d0041722bfd9649b4f31971c25941cd9c01d4cbd4f774ba886ac6f77092f0b", - "sha256:9daca96ada19b79353a022d77f38225a1dedfb632d6107a4846f8c1b5ace88a2", - "sha256:9e18d89a1996306bf44341b55dd5cda2afa5a6aeeb4ebb99240b26bce1e89bd2", - "sha256:a8d191eac2c7a395a74ceade0e58eea0ad5de823200b7b995ecab29295cf027b", - "sha256:af63e6ed7635e5180bf278d0bd967097dcb295e0a58ba815a1c29d179479bfae", - "sha256:b5c8b6a7bc194ca189db8cae6991d7fba0e1612c5c44297b989364e096a12a03", - "sha256:b7643090e8e068ca25b5525d81a61d001e6594f6fa62452a1cb536e238b23c99", - "sha256:bd3b5d73ca86928410473be4d3246730263a36fb0afd355f4ddec450ec5f881b", - "sha256:c52d38b7180e19b6f5430eb76ae6fb160a5be2f5630834ea71d592bc5bff3671", - "sha256:c94030f625e8139a41528d546c4b41cc223ee28f301af5d780eda10dcc96364c", - "sha256:d4ce2830fd0d8ec037deefe662633935fa4a4cccc8647ac0515181c939c2184a", - "sha256:dba43d4b1ccf1b1eb67a8adbfae788f3498e21997124010ad0f819dad3cf8b1d", - "sha256:e419b441ba39716f1b9095dc3ed7adf3115611504d8933e973a7036ccef4168f", - "sha256:e7b73325a5b345d10b78018d6ca123c6348825c82f3670a169143050d087ee65" - ], - "version": "==0.9.5" - }, - "pyyaml": { - "hashes": [ - "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", - "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", - "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", - "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7", - "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", - "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", - "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", - "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269" - ], - "index": "pypi", - "version": "==3.12" - }, - "regex": { - "hashes": [ - "sha256:19c4b0f68dd97b7116e590f47d60d97ab9e76966acc321b1d20dd87c2b64dff2", - "sha256:1af6b820bec5ca82af87447af5a6dcc23b3ddc96b0184fd71666be0c24fb2a4f", - "sha256:232dbc28a2562d92d713c3c1eb2b9276f3ebcbdb6d3e96ff68d0417a71926784", - "sha256:3d26ce7e605a501509b68c343fc9d9e09f76c2e9e261df8183027bdc750c97ce", - "sha256:52b590a41b9677314d02d9055edc33992db758b3d5167aa1365229a6a0c26a6d", - "sha256:565f9aac9cd43b2351f7fcbc0d6056f8aebf4f6d049a17982085019ab9acdf28", - "sha256:656984899644d3fe2e40533724f513a21127f77162a15dd5244af3c965152c63", - "sha256:689c9d17c3ba02f52e8481a5c584c8c11ba27d6cc5f939efdd838ae0d0d1af41", - "sha256:8a9d9db8ef1621ae51ea12acb5e503204b4586e05c6cfd418aecb9466a71bd87", - "sha256:ad2beea450d551b11b47512ce920127d7c8645e528cc56dc9502c5973e8732f3", - "sha256:b39867f577bc59b2fec9209facc513c761978e4ac63f4b73b9750a2c1501729e", - "sha256:b6a7725a069be8f9dd09e1e500e5b57556b301942e21c8c712627f73ec048286", - 
"sha256:b9e9b97696e75e826adac1920b13e7bac3a6a2128c085783abd208d73a278d70", - "sha256:bf4896ed1ca2017153fc6b341bc8a0da8ca5480f85eebd7bfe58bbafceb4e728", - "sha256:c3c2fe1e0d90f4c93be5b588480f05defd44f64c65767a657de69c4db4429a39", - "sha256:d811874ed669165fe1059a54f860db5c6ab5f48100bf4945d915fd2f877b2531", - "sha256:db616380b04e29e5709bc3ec0674e827dfed3d18e7d686c09537ab01506127c9", - "sha256:efa66273b49dbd7a9f6a4d02d1a7d5bf353d568a89f7cd8927812daa9f83bb84", - "sha256:f8feab5b517cdc65a61a50549e7dcfa0f61ab872a0034da1f6b8d61775178b6a" - ], - "index": "pypi", - "version": "==2017.4.5" - }, - "requests": { - "hashes": [ - "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", - "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" - ], - "version": "==2.19.1" - }, - "scikit-learn": { - "hashes": [ - "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", - "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", - "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", - "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", - "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", - "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", - "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", - "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", - "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", - "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", - "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", - "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", - "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", - "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", - "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", - "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", - "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", - "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", - "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", - "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", - "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", - "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", - "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0" - ], - "version": "==0.19.1" - }, - "scipy": { - "hashes": [ - "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", - "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", - "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", - "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", - "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", - "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", - "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", - "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", - "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", - "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", - "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", - 
"sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", - "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", - "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", - "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", - "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", - "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", - "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", - "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", - "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", - "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", - "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", - "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40" - ], - "index": "pypi", - "version": "==1.1.0" - }, - "seqeval": { - "hashes": [ - "sha256:6dc7f9ddf5246b909adb0c349575daedbe7828c2bc02df4c81fd4bd80ad8adaa" - ], - "version": "==0.0.3" - }, - "six": { - "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" - ], - "version": "==1.11.0" - }, - "sklearn": { - "hashes": [ - "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" - ], - "index": "pypi", - "version": "==0.0" - }, - "sklearn-crfsuite": { - "hashes": [ - "sha256:2f59aad3055e01a778a79a6352891cac04788e8b52688aa5bc8b11be7717861e", - "sha256:6e9a42bc3de96941d5f7262335130955b8c380b1356147622368f385075705d9" - ], - "index": "pypi", - "version": "==0.3.6" - }, - "spacy": { - "hashes": [ - "sha256:cddb06e7965222e4339eb59d2258db8dadab19ef8b0a1a44a2d33f94935ba421" - ], - "index": "pypi", - "version": "==2.0.11" - }, - "tabulate": { - "hashes": [ - "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" - ], - "version": "==0.8.2" - }, - "tensorboard": { - "hashes": [ - "sha256:2651a4d9261a6593cb2c3514576e4bf25e273837c79e98f68a3cf51759f68725", - "sha256:7776cc8bcfd0d07cd106e9e86c011ab8eca38c97b57f5433b9f0fd6bbf31a36e" - ], - "version": "==1.8.0" - }, - "tensorflow": { - "hashes": [ - "sha256:1fc4eb267bc973f2a8466778ad0c6f110f8356373e534c8463a7bb9b6d86e5f0", - "sha256:24c3d3f87dc6108f4de3bbb7f647bcd6f303f874721b176185f260adea88fe40", - "sha256:3087797a85610b756066df294f98e666e49f4fa60c6e1dbcf4b190b21397af71", - "sha256:36a6671d39a3aec89cb4cce125e90305c3f8a69da16d2fb72eff40fe478e1de9", - "sha256:373c637f9c4f06346b1b02e280b30517d262a1651b252ff889f7c22716ef4548", - "sha256:397c0766e166fb768613b1498a2a6f67eeea077425d234c0138d55e85408473d", - "sha256:68bda870c355aa64dc3dc377d5b9d13c829281467a7c3b95cb8529031d8f68b2", - "sha256:899e53e01efcc8e906b3d1f53fe6a6264edf5dfc275b32cadbc6f4e33dca78f7", - "sha256:92801b9aebcc195d3b5a492d1d46330c47c11c6f0c0f7ab6b605da489482c64e", - "sha256:d345d296aeb05eeb50d9de43a1dcb66ceaba6a2bd603f58aeefaa07b2c1bfac1", - "sha256:dccc205e84cd33a240a601046e88eacefe12d677a1bbdf17f0ebafd1a7c84c70", - "sha256:de51f60021ea8160ea6d0340e827a26331cd549f6e7c470fba7ee83aeab4f818" - ], - "index": "pypi", - "version": "==1.8.0" - }, - "termcolor": { - "hashes": [ - "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" - ], - "version": "==1.1.0" - }, - "thinc": { - "hashes": [ - "sha256:9a1deb850285f76efaf0ae38b605a137a3978826282cc57dcc1e66b779402a76" - ], - "version": "==6.10.2" - }, - "toolz": { - "hashes": [ - 
"sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9" - ], - "version": "==0.9.0" - }, - "tqdm": { - "hashes": [ - "sha256:224291ee0d8c52d91b037fd90806f48c79bcd9994d3b0abc9e44b946a908fccd", - "sha256:77b8424d41b31e68f437c6dd9cd567aebc9a860507cb42fbd880a5f822d966fe" - ], - "version": "==4.23.4" - }, - "ujson": { - "hashes": [ - "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" - ], - "version": "==1.35" - }, - "urllib3": { - "hashes": [ - "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", - "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" - ], - "version": "==1.23" - }, - "werkzeug": { - "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" - ], - "version": "==0.14.1" - }, - "wheel": { - "hashes": [ - "sha256:0a2e54558a0628f2145d2fc822137e322412115173e8a2ddbe1c9024338ae83c", - "sha256:80044e51ec5bbf6c894ba0bc48d26a8c20a9ba629f4ca19ea26ecfcf87685f5f" - ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*'", - "version": "==0.31.1" - }, - "wrapt": { - "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" - ], - "version": "==1.10.11" - } - }, - "develop": { - "absl-py": { - "hashes": [ - "sha256:e0eb8358b549552b1cc5972350bc3e41dd0a926c15b3ff95ce60f3c78c80824c" - ], - "version": "==0.2.2" - }, - "anago": { - "hashes": [ - "sha256:a4bd7b0d6109408fbdd9cdd2d6bfb60221bd7293c0645a75e6fddddce40abcc1" - ], - "index": "pypi", - "version": "==1.0.6" - }, - "astor": { - "hashes": [ - "sha256:64c805f1ad6fbc505633416b6174fc23796eb164f371a7dc1f3951ea30560fb5", - "sha256:ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d" - ], - "version": "==0.6.2" - }, - "bleach": { - "hashes": [ - "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65", - "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d" - ], - "version": "==1.5.0" - }, - "certifi": { - "hashes": [ - "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7", - "sha256:9fa520c1bacfb634fa7af20a76bcbd3d5fb390481724c597da32c719a7dca4b0" - ], - "version": "==2018.4.16" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "cymem": { - "hashes": [ - "sha256:00bb3645dfb9a020d735ba3d6f822b04656388180588d8b2cebde967ee678bcc", - "sha256:0dd61d05977839a922c0d797c355b98949210575918b1743b41e38ae9fb2c3a7", - "sha256:4bc1056b52d959fcbb1e0f32ec84fa131754d6be1e36b65782c6ac86419f4bf3", - "sha256:4c5d9ca6ec706792b8d9b1faf6db77b95545c388c768b21d940f197aa7efbb7e", - "sha256:50292f4dd0d950a8698bae27d71efe59da7ff08e591b735e08b658aae42c4745", - "sha256:616d06333f46dd03c128d97912d361183fc02249e6420a7b7907b41214c51562", - "sha256:944af97d4d34a2470b5199f1c31d2dfc79cdec7bd7a41354d839a8ab87fdfaa6", - "sha256:b38056efb99078b06c504adb5f03a8d9e822a5543451737b746028a71c4b1ac3", - "sha256:b6513b2926c60d641f159e79e6fb16460dfb50ebcce31a5af0370c51837c7efc", - "sha256:daa6003fcc199752ab703142021cff74774872a932303b240dc0ea177adf295d", - "sha256:f06d9b50da0474d7405674d8101c319d89a17d33792d6d429fe3d5c64f0d9df1" - ], - "version": "==1.31.2" - }, - "cytoolz": { - "hashes": [ - 
"sha256:476a2ad176de5eaef80499b7b43d4f72ba6d23df33d349088dae315e9b31c552" - ], - "version": "==0.8.2" - }, - "decorator": { - "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" - ], - "version": "==4.3.0" - }, - "dill": { - "hashes": [ - "sha256:624dc244b94371bb2d6e7f40084228a2edfff02373fe20e018bef1ee92fdd5b3" - ], - "version": "==0.2.8.2" - }, - "future": { - "hashes": [ - "sha256:e39ced1ab767b5936646cedba8bcce582398233d6a627067d4c6a454c90cfedb" - ], - "index": "pypi", - "version": "==0.16.0" - }, - "gast": { - "hashes": [ - "sha256:7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930" - ], - "version": "==0.2.0" - }, - "grpcio": { - "hashes": [ - "sha256:0feade5de967be3c9ee041662d1347fc537ad05ccbcf05bcf1efa05072bef926", - "sha256:1ae02a9787cf2c5f25add0806f6271283b6074ab8619077d2b5c9037950c890b", - "sha256:2d7215dca11ba4aa49cc6c05b37e4b0a0f99727c8604e8ccd5ef1f6e06332200", - "sha256:35a4f6ffae88ce6a461e503ae91b62dc5c96013cafc717f2d7139686b5c39969", - "sha256:3be7635b4308e06449b2275a5e96a030bbf82ba6797ae8947f14667491924d81", - "sha256:3d1b3e7042a41b167334f718842f13deb80287886c9160efe31252602b13a128", - "sha256:468d4ce007cb859d5f9440cf4a7461cc172fd07d690300f4db88afaa78f01003", - "sha256:59a2fb52d286a38b9cbc7434eb473026fde0b20c223a10a99f5c3d4e395c2c2b", - "sha256:59c7670c902acce952ba709d9126cda87a45d7fed6bd568868e74171e4acd7f7", - "sha256:5b03fd3941c5e1a5deb01026bae025d319b38d3facb3e5fc491bca73e908d69e", - "sha256:65842e698776f4e49f62346c0f80fc31b34907e0df4247650c643113ef167122", - "sha256:68dbe71f890475e2824afbc5dc72714d1fca668bc15df0954bda4a8a5a53d0c7", - "sha256:868973b64b7e2464e5297cc660da588c542c175e85f6d2f7490d86c0dd5dbb4c", - "sha256:86f0c2062fde76789f7cdbf67d4ede116e7e1ceaf4c327fff7b9d17eb5852403", - "sha256:9444863aaba55b662719e22680f11134182604619f241cc607020e5b3786f4cd", - "sha256:9ac704e25d271af62c1ea72f1cb42ec7938f26f00314a8f324999ac5e1bf55eb", - "sha256:a02ef0354fb455a9ce2ad869a40f28f20a64147d46557c59b7269a15832c36d2", - "sha256:aabcdc960633231f9575252c061b480fc56a1ff6dcc7999fa5d4968f574d894f", - "sha256:b47a19a3be2f9608b4296bd16374c9a922d3206cf0a917792801a5cef5a2fa23", - "sha256:bf7bfe162057e6f1e3f4613b2a5f1157c8e286bddeaa40f7b8ce5054cb4b1413", - "sha256:c3cf3f431b41c39aa1501458d0e46086e699836536af873fe028dda1dfc6bcbd", - "sha256:cc7bd47eca988831d58a618908c825204d6ee8e90cdb9854a09b52a3b76ac168", - "sha256:cde83440fb4691d1bd8620ea919a9bd3199e6725e72d2c0d94898a2774c255ee", - "sha256:e1a03666852b956f7949c2a7f187dd54406cae2874c2ce26c1a0dafddf812cb2", - "sha256:e322eb5dc533cfbf21a9e964ebab80da391a26234a82288bfce505058913dfac", - "sha256:f0169d98670ef1db52e4f6930fd470c34731948350cabbe93087a8462b1f1da4", - "sha256:f14faadfd09aa8526536cd2149e274563f45b767fca1736ccc53803a6af3f90e" - ], - "version": "==1.12.1" - }, - "h5py": { - "hashes": [ - "sha256:0f8cd2acbacf3177b4427ed42639c911667b1f24d923388ab1f8ad466a12be5e", - "sha256:11277e3879098f921ee9e29105b20591e1dfdd44963357399f2abaa1a280c560", - "sha256:1241dec0c94ac32f3285cac1d6f44beabf80423e422ab03bd2686d731a8a9294", - "sha256:17b8187de0b3a945d8e8d031e7eb6ece2fce90791f9c5fde36f4396bf38fdde1", - "sha256:308e0758587ee16d4e73e7f2f8aae8351091e343bf0a43d2f697f9535465c816", - "sha256:37cacddf0e8209905f52537a8cf71da0dd9a4de62bd79247274c97b24a408997", - "sha256:38a23bb599748adf23d77f74885c0de6f4a7d9baa42f74e476bbf90fba2b47dd", - "sha256:47ab18b7b7bbc36fd2b606289b703b6f0ee915b923d6ad94dd17ac80ebffc280", - 
"sha256:486c78330af0bf33f5077b51d1888c0739c3cd1a03d5aade0d48572b3b5690ca", - "sha256:4e2183458d6ef1ae87dfb5d6acd0786359336cd9ac0ece6396c09b59fdaa3bd6", - "sha256:51d0595c3e58814c831f6cd2b664a5bf9590e26262c1d541b380d041e4fcb3c0", - "sha256:56d259d56822b70881760b243957f04a0cf133f0ec65eae6a33f562826aee899", - "sha256:5e6e777653169a3cc24ea56bb3d8c845ea391f8914c35bb6f350b0753a52891c", - "sha256:62bfb0ebb0f59e5dccc0b0dbbc0fc40dd1d1e09d04c0dc71f89790231531d4a2", - "sha256:67d89b64debfa021b54aa6f24bbf008403bd144748a0148596b518bce80d2fc4", - "sha256:9214ca445c18a37bfe9c165982c0e317e2f21f035c8d635d1c6d9fcbaf35b7a8", - "sha256:ab0c52850428d2e86029935389379c2c97f752e76b616da851deec8a4484f8ec", - "sha256:b2eff336697d8dfd712c5d93fef9f4e4d3e97d9d8c258801836b8664a239e07a", - "sha256:bb33fabc0b8f3fe3bb0f8d6821b2fad5b2a64c27a0808e8d1c5c1e3362062064", - "sha256:bd5353ab342bae1262b04745934cc1565df4cbc8d6a979a0c98f42209bd5c265", - "sha256:c45650de228ace7731e4280e14fb687f6d5c29cd666c5b22b42492b035e994d6", - "sha256:d5c0c01da45f901a3d429e7ef9e7e22baa869e1affb8715f1bf94e6a30020740", - "sha256:d75035db5bde802a29f4f29f18bb7548863d29ac90ccbf2c04c11799bbbba2c3", - "sha256:dda88206dc9464923f27f601000bc5b152ac0bd6d0122f098d4f239150a70076", - "sha256:e1c2ac5d0aa232c0f60fecc6bd1122346885086a176f939b91058c4c980cc226", - "sha256:e626c65a8587921ebc7fb8d31a49addfdd0b9a9aa96315ea484c09803337b955" - ], - "index": "pypi", - "version": "==2.8.0" - }, - "html5lib": { - "hashes": [ - "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" - ], - "version": "==0.9999999" - }, - "hyperopt": { - "hashes": [ - "sha256:4f6e903f7640165ea3e4c622050b41ffab0bee7811ede23c7825a5884976d72f" - ], - "index": "pypi", - "version": "==0.1" - }, - "idna": { - "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" - ], - "version": "==2.7" - }, - "joblib": { - "hashes": [ - "sha256:9de5fe8bc953f871f862d27e77f153c31d545b84f2aa31a63b5165e912ad6dfa", - "sha256:aba9f97aa3e0548be6fc458b5d708be863eb4be35830caeb3faa7bd3d9afb7bb" - ], - "index": "pypi", - "version": "==0.12.0" - }, - "keras": { - "hashes": [ - "sha256:5b8499d157af217f1a5ee33589e774127ebc3e266c833c22cb5afbb0ed1734bf", - "sha256:fa71a1f576dbd643532b872b8952afb65cc3ff7ed20d172e6b49657b710b43d0" - ], - "index": "pypi", - "version": "==2.2.0" - }, - "keras-applications": { - "hashes": [ - "sha256:7dceb9820b39c01459ea5e8922add86eb99a9e14354c33dc9981d5f5077fa0ac", - "sha256:9924be748e5d180806d133c714d22895b997ed722757491dd99538851145d3bf" - ], - "version": "==1.0.2" - }, - "keras-preprocessing": { - "hashes": [ - "sha256:5283236f0b22a57b30bda766fc819b2ed2483c52f3e1f8b39fcc528f51f772e7", - "sha256:8649ba6377ecc06ea10e0a8a954df5600d115b4b626861e33c79b41ec03c5194" - ], - "version": "==1.0.1" - }, - "markdown": { - "hashes": [ - "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", - "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" - ], - "version": "==2.6.11" - }, - "msgpack-numpy": { - "hashes": [ - "sha256:6947df61826a2917e38dbe07957a0c70dc82dce93ec38153dae850fdd21a4583", - "sha256:afc603c7cf8497fb125a8c8c713518a004e9662101f088e3d4fcf7688b08eeb3" - ], - "version": "==0.4.1" - }, - "msgpack-python": { - "hashes": [ - "sha256:378cc8a6d3545b532dfd149da715abae4fda2a3adb6d74e525d0d5e51f46909b" - ], - "version": "==0.5.6" - }, - "murmurhash": { - "hashes": [ - "sha256:651137ed3e1169342c9edade454f3beb7fcdf28d4ad1ac232725237eaf442d9a" 
- ], - "version": "==0.28.0" - }, - "nerds": { - "editable": true, - "path": "." - }, - "networkx": { - "hashes": [ - "sha256:0d0e70e10dfb47601cbb3425a00e03e2a2e97477be6f80638fef91d54dd1e4b8", - "sha256:1b229b54fe9ccb009cee4de02a88552191497a542a7d5d34adab216b9f15c1ff", - "sha256:b3e0144d5fe6b7479b694e1b598a5545a38f3fc6f1e3c09173eb30f0c7a5770e" - ], - "index": "pypi", - "version": "==1.11" - }, - "nltk": { - "hashes": [ - "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" - ], - "index": "pypi", - "version": "==3.3.0" - }, - "nose": { - "hashes": [ - "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", - "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", - "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" - ], - "index": "pypi", - "version": "==1.3.7" - }, - "numpy": { - "hashes": [ - "sha256:07379fe0b450f6fd6e5934a9bc015025bb4ce1c8fbed3ca8bef29328b1bc9570", - "sha256:085afac75bbc97a096744fcfc97a4b321c5a87220286811e85089ae04885acdd", - "sha256:2d6481c6bdab1c75affc0fc71eb1bd4b3ecef620d06f2f60c3f00521d54be04f", - "sha256:2df854df882d322d5c23087a4959e145b953dfff2abe1774fec4f639ac2f3160", - "sha256:381ad13c30cd1d0b2f3da8a0c1a4aa697487e8bb0e9e0cbeb7439776bcb645f8", - "sha256:385f1ce46e08676505b692bfde918c1e0b350963a15ef52d77691c2cf0f5dbf6", - "sha256:4d278c2261be6423c5e63d8f0ceb1b0c6db3ff83f2906f4b860db6ae99ca1bb5", - "sha256:51c5dcb51cf88b34b7d04c15f600b07c6ccbb73a089a38af2ab83c02862318da", - "sha256:589336ba5199c8061239cf446ee2f2f1fcc0c68e8531ee1382b6fc0c66b2d388", - "sha256:5edf1acc827ed139086af95ce4449b7b664f57a8c29eb755411a634be280d9f2", - "sha256:6b82b81c6b3b70ed40bc6d0b71222ebfcd6b6c04a6e7945a936e514b9113d5a3", - "sha256:6c57f973218b776195d0356e556ec932698f3a563e2f640cfca7020086383f50", - "sha256:758d1091a501fd2d75034e55e7e98bfd1370dc089160845c242db1c760d944d9", - "sha256:8622db292b766719810e0cb0f62ef6141e15fe32b04e4eb2959888319e59336b", - "sha256:8b8dcfcd630f1981f0f1e3846fae883376762a0c1b472baa35b145b911683b7b", - "sha256:97fa8f1dceffab782069b291e38c4c2227f255cdac5f1e3346666931df87373e", - "sha256:9d69967673ab7b028c2df09cae05ba56bf4e39e3cb04ebe452b6035c3b49848e", - "sha256:9e1f53afae865cc32459ad211493cf9e2a3651a7295b7a38654ef3d123808996", - "sha256:a4a433b3a264dbc9aa9c7c241e87c0358a503ea6394f8737df1683c7c9a102ac", - "sha256:baadc5f770917ada556afb7651a68176559f4dca5f4b2d0947cd15b9fb84fb51", - "sha256:c725d11990a9243e6ceffe0ab25a07c46c1cc2c5dc55e305717b5afe856c9608", - "sha256:d696a8c87315a83983fc59dd27efe034292b9e8ad667aeae51a68b4be14690d9", - "sha256:e1864a4e9f93ddb2dc6b62ccc2ec1f8250ff4ac0d3d7a15c8985dd4e1fbd6418" - ], - "index": "pypi", - "version": "==1.14.5" - }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - ], - "version": "==1.0.1" - }, - "plac": { - "hashes": [ - "sha256:854693ad90367e8267112ffbb8955f57d6fdeac3191791dc9ffce80f87fd2370", - "sha256:ba3f719a018175f0a15a6b04e6cc79c25fd563d348aacd320c3644d2a9baf89b" - ], - "version": "==0.9.6" - }, - "preshed": { - "hashes": [ - "sha256:a6b3a9e34634600e3e410ec25e0debed4b65a47eb37514a063d189d1c425b4dd" - ], - "version": "==1.0.0" - }, - "protobuf": { - "hashes": [ - "sha256:12985d9f40c104da2f44ec089449214876809b40fdc5d9e43b93b512b9e74056", - "sha256:12c97fe27af12fc5d66b23f905ab09dd4fb0c68d5a74a419d914580e6d2e71e3", - "sha256:327fb9d8a8247bc780b9ea7ed03c0643bc0d22c139b761c9ec1efc7cc3f0923e", - "sha256:3895319db04c0b3baed74fb66be7ba9f4cd8e88a432b8e71032cdf08b2dfee23", - 
"sha256:695072063e256d32335d48b9484451f7c7948edc3dbd419469d6a778602682fc", - "sha256:7d786f3ef5b33a04e6538089674f244a3b0f588155016559d950989010af97d0", - "sha256:8bf82bb7a466a54be7272dcb492f71d55a2453a58d862fb74c3f2083f2768543", - "sha256:9bbc1ae1c33c1bd3a2fc05a3aec328544d2b039ff0ce6f000063628a32fad777", - "sha256:9f1087abb67b34e55108bc610936b34363a7aac692023bcbb17e065c253a1f80", - "sha256:9fefcb92a3784b446abf3641d9a14dad815bee88e0edd10b9a9e0e144d01a991", - "sha256:a37836aa47d1b81c2db1a6b7a5e79926062b5d76bd962115a0e615551be2b48d", - "sha256:cca22955443c55cf86f963a4ad7057bca95e4dcde84d6a493066d380cfab3bb0", - "sha256:d7ac50bc06d31deb07ace6de85556c1d7330e5c0958f3b2af85037d6d1182abf", - "sha256:dfe6899304b898538f4dc94fa0b281b56b70e40f58afa4c6f807805261cbe2e8" - ], - "version": "==3.6.0" - }, - "pyahocorasick": { - "hashes": [ - "sha256:3d584e7836ca7b066f99d7fdb384dc6ef7af211b2b139baedbd960c7c279bb7f" - ], - "index": "pypi", - "version": "==1.1.8" - }, - "pymongo": { - "hashes": [ - "sha256:061085dfe4fbf1d9d6ed2f2e52fe6ab72559e48b4294370b433751638160d10b", - "sha256:07fdee1c5567f237796a8550233e04853785d8dcf95929f96ab519ed91543109", - "sha256:0d98731aaea8cb32b535c376f6785927e4e3d9459ffe1440b8a639827a849350", - "sha256:10f683950f70626ccedf4a662d1c0b3244e8e013c2067872af5633830abd1bfd", - "sha256:2954b99cfeb76776879e9f8a4cae9c5e19d5eff92d0b7b663ceddcf192adb66b", - "sha256:419ed5d5b76ef304815f354d9df7f2085acfd6ff7cc1b714ca702e2239b341c2", - "sha256:42ec201fd9a26e7c1e611e3db19324dead51dd4646391492eb238b41749340e8", - "sha256:4400fa92af310bf66b76c313c7ded3bb63f3d63b4f43c3bfbff552cf294dc9fa", - "sha256:4807dfbb5cdcfe0224329992dc48b897c780d0ad7553c3799d34f84ba5cab446", - "sha256:54daf67e1e7e7e5a5160c86123bdd39b1d3b25876c2ab38230dc2a764cb3d98f", - "sha256:5fd6ce5ed3c6c92d2c94756e6bf041304e5c7c5a5dbea31b8957d52a78bdf01d", - "sha256:601e00fe7fb283f04c95f5dafb787c0862f48ca015a6f1f81b460c74e4303873", - "sha256:7fbd9233e8b6741b047c5857e2ad5efb74091f167d7fa8a2a3379217165058f9", - "sha256:7ffac35362c07c103b024b89875e8d7f0625129b65c56fa8a3ecebbd56110405", - "sha256:833bc6cb2ec7058dea9f5840a9314ac74738d2117486a044e88f3976e37ea7a0", - "sha256:92cb26a2a9b38e8df5215803f950b20a6c847d5e00d1dd125eaa84f05f9472d7", - "sha256:9e5f0e8967d95a256038817460844a8aab588b9bc9ba6296507a1863960a0e44", - "sha256:abf83b908e535b1386a7732825994e6e36eff6394c1829f3e7a23888136484fa", - "sha256:adb2dba52c8a2a2d7bcd3b267f7bbf7c822850cf6a7cd15211b9f386c3a670ef", - "sha256:ae7b3479822a03f6f651913de84ba67101f23e051ae88034085e974f472dcfff", - "sha256:cc15b30f0ac518e6cbd4b6e6e6162f8aa14edfe255d0841146f146151bd58865", - "sha256:d23498d62063b715078947bef48fa4d34dc354f3b268ed15dc6b46fc809a88e9", - "sha256:dd29bb5bc9068ccc248c8c145efd839421f04363b468b47cfa2d4902ca369afe", - "sha256:e53ad0cc6c489f83e7f6bb6121aa73bb6f6488410024a3bd77c16af1aa3a1000", - "sha256:ecb11113407d919f8714cc7d0841985044633d0b561ef3d797e1b494a3e73537", - "sha256:ece2c2add66d3ec2720a963bf073ca11fc3b0b58159767fc3bc5ddaad791d481", - "sha256:ef25c8675f5c8c19832f69cd97d728d99bb4ab9c3b200e28a5c8416631afaf3c", - "sha256:f62a818d643776873713c5676f17bd95ac4176220b13dd12c14edd3a450d1ac9", - "sha256:f7ebcb846962ee40374db2d9014a89bea9c983ae63c1877957c3a0a756974796" - ], - "version": "==3.6.1" - }, - "python-crfsuite": { - "hashes": [ - "sha256:10d84507d96d2870fba053d6659170113675762745e715befe0d9671d8988098", - "sha256:19882b03d26abe075c280f3450829f520a36d17a050621e48109094ea94f4965", - "sha256:2e1cdceca173cf73360220737648acf87244c1ae98eebf9f41d7a86035d4eaac", - 
"sha256:3b36da634400dd1557bb947ddf009e6328e7ef76332054f6484e8cec6993b86e", - "sha256:470220d0f4be28769505c5cd8e2854ee25d748bc38d70f42811832031f245273", - "sha256:47f10949a003439cdb7922b43177c96c2c1eea5c56ed2d17270c850bc86ccab3", - "sha256:5ad7395dcce74fd07fc031f1e6c1160ff7b43ea343129caa77fb8d8afe7e2ecb", - "sha256:607f4276025b2217e326e6c9161308fb2ad7f881665bfff072dbedae6cdc9cf8", - "sha256:6c03fe809957eaf9333816f52dd497678a013ee918c502a7a529b191ef19c694", - "sha256:73b57919711499d2ff32843a28b2a6c17487630ba255a6aa4a093ee0a7f3b1fa", - "sha256:7861c9c7635e868c67914e77ccf757e096387edf676f78425aa05b2f8b32ced7", - "sha256:8b656eee39d0fdea760285b574c5517d309f943be98d87e42425d332cdf687ca", - "sha256:942aa72793dad1f9b0d65991b4386e385c4bf9248d2072e0e058f6cbb41e15dc", - "sha256:968f8597f5df1ec54c5e72ce50775118a09f3c5737962e058f94a83da30d2a68", - "sha256:96d0041722bfd9649b4f31971c25941cd9c01d4cbd4f774ba886ac6f77092f0b", - "sha256:9daca96ada19b79353a022d77f38225a1dedfb632d6107a4846f8c1b5ace88a2", - "sha256:9e18d89a1996306bf44341b55dd5cda2afa5a6aeeb4ebb99240b26bce1e89bd2", - "sha256:a8d191eac2c7a395a74ceade0e58eea0ad5de823200b7b995ecab29295cf027b", - "sha256:af63e6ed7635e5180bf278d0bd967097dcb295e0a58ba815a1c29d179479bfae", - "sha256:b5c8b6a7bc194ca189db8cae6991d7fba0e1612c5c44297b989364e096a12a03", - "sha256:b7643090e8e068ca25b5525d81a61d001e6594f6fa62452a1cb536e238b23c99", - "sha256:bd3b5d73ca86928410473be4d3246730263a36fb0afd355f4ddec450ec5f881b", - "sha256:c52d38b7180e19b6f5430eb76ae6fb160a5be2f5630834ea71d592bc5bff3671", - "sha256:c94030f625e8139a41528d546c4b41cc223ee28f301af5d780eda10dcc96364c", - "sha256:d4ce2830fd0d8ec037deefe662633935fa4a4cccc8647ac0515181c939c2184a", - "sha256:dba43d4b1ccf1b1eb67a8adbfae788f3498e21997124010ad0f819dad3cf8b1d", - "sha256:e419b441ba39716f1b9095dc3ed7adf3115611504d8933e973a7036ccef4168f", - "sha256:e7b73325a5b345d10b78018d6ca123c6348825c82f3670a169143050d087ee65" - ], - "version": "==0.9.5" - }, - "pyyaml": { - "hashes": [ - "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", - "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", - "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", - "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7", - "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", - "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", - "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", - "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269" - ], - "index": "pypi", - "version": "==3.12" - }, - "regex": { - "hashes": [ - "sha256:19c4b0f68dd97b7116e590f47d60d97ab9e76966acc321b1d20dd87c2b64dff2", - "sha256:1af6b820bec5ca82af87447af5a6dcc23b3ddc96b0184fd71666be0c24fb2a4f", - "sha256:232dbc28a2562d92d713c3c1eb2b9276f3ebcbdb6d3e96ff68d0417a71926784", - "sha256:3d26ce7e605a501509b68c343fc9d9e09f76c2e9e261df8183027bdc750c97ce", - "sha256:52b590a41b9677314d02d9055edc33992db758b3d5167aa1365229a6a0c26a6d", - "sha256:565f9aac9cd43b2351f7fcbc0d6056f8aebf4f6d049a17982085019ab9acdf28", - "sha256:656984899644d3fe2e40533724f513a21127f77162a15dd5244af3c965152c63", - "sha256:689c9d17c3ba02f52e8481a5c584c8c11ba27d6cc5f939efdd838ae0d0d1af41", - "sha256:8a9d9db8ef1621ae51ea12acb5e503204b4586e05c6cfd418aecb9466a71bd87", - "sha256:ad2beea450d551b11b47512ce920127d7c8645e528cc56dc9502c5973e8732f3", - "sha256:b39867f577bc59b2fec9209facc513c761978e4ac63f4b73b9750a2c1501729e", - 
"sha256:b6a7725a069be8f9dd09e1e500e5b57556b301942e21c8c712627f73ec048286", - "sha256:b9e9b97696e75e826adac1920b13e7bac3a6a2128c085783abd208d73a278d70", - "sha256:bf4896ed1ca2017153fc6b341bc8a0da8ca5480f85eebd7bfe58bbafceb4e728", - "sha256:c3c2fe1e0d90f4c93be5b588480f05defd44f64c65767a657de69c4db4429a39", - "sha256:d811874ed669165fe1059a54f860db5c6ab5f48100bf4945d915fd2f877b2531", - "sha256:db616380b04e29e5709bc3ec0674e827dfed3d18e7d686c09537ab01506127c9", - "sha256:efa66273b49dbd7a9f6a4d02d1a7d5bf353d568a89f7cd8927812daa9f83bb84", - "sha256:f8feab5b517cdc65a61a50549e7dcfa0f61ab872a0034da1f6b8d61775178b6a" - ], - "index": "pypi", - "version": "==2017.4.5" - }, - "requests": { - "hashes": [ - "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", - "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" - ], - "version": "==2.19.1" - }, - "scikit-learn": { - "hashes": [ - "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", - "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", - "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", - "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", - "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", - "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", - "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", - "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", - "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", - "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", - "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", - "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", - "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", - "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", - "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", - "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", - "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", - "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", - "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", - "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", - "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", - "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", - "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0" - ], - "version": "==0.19.1" - }, - "scipy": { - "hashes": [ - "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", - "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", - "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", - "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", - "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", - "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", - "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", - "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", - "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", - "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", - 
"sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", - "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", - "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", - "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", - "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", - "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", - "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", - "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", - "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", - "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", - "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", - "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", - "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40" - ], - "index": "pypi", - "version": "==1.1.0" - }, - "seqeval": { - "hashes": [ - "sha256:6dc7f9ddf5246b909adb0c349575daedbe7828c2bc02df4c81fd4bd80ad8adaa" - ], - "version": "==0.0.3" - }, - "six": { - "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" - ], - "version": "==1.11.0" - }, - "sklearn": { - "hashes": [ - "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" - ], - "index": "pypi", - "version": "==0.0" - }, - "sklearn-crfsuite": { - "hashes": [ - "sha256:2f59aad3055e01a778a79a6352891cac04788e8b52688aa5bc8b11be7717861e", - "sha256:6e9a42bc3de96941d5f7262335130955b8c380b1356147622368f385075705d9" - ], - "index": "pypi", - "version": "==0.3.6" - }, - "spacy": { - "hashes": [ - "sha256:cddb06e7965222e4339eb59d2258db8dadab19ef8b0a1a44a2d33f94935ba421" - ], - "index": "pypi", - "version": "==2.0.11" - }, - "tabulate": { - "hashes": [ - "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" - ], - "version": "==0.8.2" - }, - "tensorboard": { - "hashes": [ - "sha256:2651a4d9261a6593cb2c3514576e4bf25e273837c79e98f68a3cf51759f68725", - "sha256:7776cc8bcfd0d07cd106e9e86c011ab8eca38c97b57f5433b9f0fd6bbf31a36e" - ], - "version": "==1.8.0" - }, - "tensorflow": { - "hashes": [ - "sha256:1fc4eb267bc973f2a8466778ad0c6f110f8356373e534c8463a7bb9b6d86e5f0", - "sha256:24c3d3f87dc6108f4de3bbb7f647bcd6f303f874721b176185f260adea88fe40", - "sha256:3087797a85610b756066df294f98e666e49f4fa60c6e1dbcf4b190b21397af71", - "sha256:36a6671d39a3aec89cb4cce125e90305c3f8a69da16d2fb72eff40fe478e1de9", - "sha256:373c637f9c4f06346b1b02e280b30517d262a1651b252ff889f7c22716ef4548", - "sha256:397c0766e166fb768613b1498a2a6f67eeea077425d234c0138d55e85408473d", - "sha256:68bda870c355aa64dc3dc377d5b9d13c829281467a7c3b95cb8529031d8f68b2", - "sha256:899e53e01efcc8e906b3d1f53fe6a6264edf5dfc275b32cadbc6f4e33dca78f7", - "sha256:92801b9aebcc195d3b5a492d1d46330c47c11c6f0c0f7ab6b605da489482c64e", - "sha256:d345d296aeb05eeb50d9de43a1dcb66ceaba6a2bd603f58aeefaa07b2c1bfac1", - "sha256:dccc205e84cd33a240a601046e88eacefe12d677a1bbdf17f0ebafd1a7c84c70", - "sha256:de51f60021ea8160ea6d0340e827a26331cd549f6e7c470fba7ee83aeab4f818" - ], - "index": "pypi", - "version": "==1.8.0" - }, - "termcolor": { - "hashes": [ - "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" - ], - "version": "==1.1.0" - }, - "thinc": { - "hashes": [ - 
"sha256:9a1deb850285f76efaf0ae38b605a137a3978826282cc57dcc1e66b779402a76" - ], - "version": "==6.10.2" - }, - "toolz": { - "hashes": [ - "sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9" - ], - "version": "==0.9.0" - }, - "tqdm": { - "hashes": [ - "sha256:224291ee0d8c52d91b037fd90806f48c79bcd9994d3b0abc9e44b946a908fccd", - "sha256:77b8424d41b31e68f437c6dd9cd567aebc9a860507cb42fbd880a5f822d966fe" - ], - "version": "==4.23.4" - }, - "ujson": { - "hashes": [ - "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" - ], - "version": "==1.35" - }, - "urllib3": { - "hashes": [ - "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", - "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" - ], - "version": "==1.23" - }, - "werkzeug": { - "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" - ], - "version": "==0.14.1" - }, - "wheel": { - "hashes": [ - "sha256:0a2e54558a0628f2145d2fc822137e322412115173e8a2ddbe1c9024338ae83c", - "sha256:80044e51ec5bbf6c894ba0bc48d26a8c20a9ba629f4ca19ea26ecfcf87685f5f" - ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*'", - "version": "==0.31.1" - }, - "wrapt": { - "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" - ], - "version": "==1.10.11" - } - } -}