Changes from all commits (64 commits)
45bf979 saving pipenv based Makefile to Makefile.pipenv, replacing with conda… (sujitpal, Jul 5, 2019)
0866dd3 moving examples out of main nerds package (sujitpal, Oct 23, 2019)
d9103c9 updated conda env instructions (sujitpal, Oct 23, 2019)
1d05a78 NERDS-4: Multi-class dictionary NER (sujitpal, Oct 28, 2019)
5ca6a4e NERDS-6: convenience method classification_report (sujitpal, Oct 29, 2019)
98f8797 NERDS-4: added a fit() method to ExactMatchMultiClassDictionaryNER (sujitpal, Oct 29, 2019)
ce26f2f moving the current nerds package to nerds_orig (sujitpal, Nov 12, 2019)
aa7b257 adding ExactMatchMultiClassDictionaryNER entry to __init__.py (sujitpal, Nov 12, 2019)
5f43d9d streamlined nerds models and code with anago style data backbone (sujitpal, Nov 12, 2019)
fe0de32 Initial commit for Ensemble NER (maximal hard voting) with weights pl… (sujitpal, Nov 13, 2019)
08c3ef3 replacing mention of IOB with BIO to reduce confusion. (sujitpal, Nov 13, 2019)
0d352f8 Replaced deprecated references to sklearn.externals.joblib with jobli… (sujitpal, Nov 14, 2019)
f12f915 Adding load and save for all NER models to tests. (sujitpal, Nov 14, 2019)
95bd584 flatten and unflatten lists with tests (sujitpal, Nov 21, 2019)
956c13a GMB example rewrite with updated API (sujitpal, Nov 22, 2019)
692c6b9 fix section levels (sujitpal, Nov 22, 2019)
3025e85 also remove zip file (sujitpal, Nov 22, 2019)
1885d92 moving artifacts to dedicated docs directory (sujitpal, Nov 22, 2019)
67171ae remove version capping for SpaCy (sujitpal, Nov 22, 2019)
172ce09 remove glove embeddings file during cleanup (sujitpal, Nov 22, 2019)
39d2c96 updating README to describe forked code (sujitpal, Nov 22, 2019)
48cdb3f example usage on BioNLP dataset (sujitpal, Nov 22, 2019)
156bf89 uncommenting commented code (sujitpal, Nov 22, 2019)
7cc402e API update (transform -> predict) (sujitpal, Nov 22, 2019)
538069a remove NLTK, add allenNLP (for ElmoNER) and transformers (sujitpal, Nov 23, 2019)
1a9dccc spelling fix (sujitpal, Nov 26, 2019)
e36981f added utility functions to convert back and forth from tokens-tags to… (sujitpal, Dec 3, 2019)
fdc13c1 cosmetic change: added line before def (sujitpal, Dec 3, 2019)
805428d applied token_to_spans and spans_to_tokens in the model code (spacy a… (sujitpal, Dec 3, 2019)
07db37b updated Anago dev link in setup.py and added PyData LA slides link. (sujitpal, Dec 5, 2019)
ac819fd renaming merged parameter for utils.tokens_to_spans and utils.spans_t… (sujitpal, Dec 13, 2019)
124af0b BRAT to IOB converter (sujitpal, Dec 14, 2019)
37864af moving fit keywords to constructor parameters in keeping with scikit-… (sujitpal, Dec 16, 2019)
1952e4d moving fit keyword args to constructor parameters per recommendation … (sujitpal, Dec 16, 2019)
d3cf616 replace vocab building using LabelEncoder (sujitpal, Dec 17, 2019)
26d62a3 Ensemble NER can run fit() and predict() in parallel (sujitpal, Dec 17, 2019)
24b3b56 update to n_jobs usage in Ensemble NER (sujitpal, Dec 17, 2019)
b6e9e2a documentation changed to numpy docstring format. (sujitpal, Dec 17, 2019)
9652959 updated list of changes (sujitpal, Dec 17, 2019)
008ffb9 documentation fix (sujitpal, Dec 23, 2019)
3539808 integrated FLAIR SequenceTagger (NER) into NERDS. (sujitpal, Dec 23, 2019)
78ae769 added FLAIR to both examples, updated Ensemble calling (constructor i… (sujitpal, Dec 23, 2019)
744622e documentation fixes (sujitpal, Dec 23, 2019)
396fd6f documentation fixes (sujitpal, Dec 23, 2019)
e0f47c0 params.yaml file creation in save(). (sujitpal, Dec 23, 2019)
c3ba6fb Flair NER additions (sujitpal, Dec 28, 2019)
aa96dd4 added link to blog post on flair based NER (sujitpal, Jan 11, 2020)
c2751f8 updated link to BioNLP shared task (sujitpal, Jan 11, 2020)
49b8511 documentation change (sujitpal, Jan 11, 2020)
b7bfdc3 updated link to data (sujitpal, Jan 11, 2020)
9f21095 rename tests to reflect intent (fix copy-paste) (sujitpal, Jan 16, 2020)
dff1a70 add encoding to load_data_and_labels (sujitpal, Jan 16, 2020)
5d90712 initial revision (sujitpal, Jan 16, 2020)
c47d8ef fix default behavior to pad with padding tag in case of truncated tok… (sujitpal, Jan 17, 2020)
2a6cc00 added suppress warnings to all tests, added new utils methods and tests. (sujitpal, Jan 18, 2020)
a94f169 initial revision for simpletransformer based NER (sujitpal, Jan 18, 2020)
22caa4c adding example runs (sujitpal, Jan 18, 2020)
35ef8de uncommenting clean up (sujitpal, Jan 18, 2020)
c9f5578 adding classification report for TransformerNER (sujitpal, Jan 18, 2020)
58c556c updated change list to include TransformerNER (sujitpal, Jan 21, 2020)
607bde5 adding TransformerNER blog post link (sujitpal, Jan 21, 2020)
514ac22 updated preprocessing logic and marked it experimental (sujitpal, Jan 21, 2020)
951fdc4 predict does its own batching, more efficient without external batching (sujitpal, Jan 21, 2020)
366420b Updated Pipfile based on security vulnerability, removing Pipfile.loc… (sujitpal, Jan 29, 2020)
11 changes: 5 additions & 6 deletions Makefile
@@ -1,9 +1,8 @@
 install:
-    pipenv install --dev -e .
+    pip install -e .

 download_models:
-    pipenv run python -m spacy download en
-    pipenv run python -m nltk.downloader averaged_perceptron_tagger
+    python -m spacy download en

 clean:
     -rm -rf build
@@ -17,11 +16,11 @@ clean_test:
 dist:
     make clean
     make download_models
-    pipenv run python setup.py bdist_wheel --dist-dir target
+    python setup.py bdist_wheel --dist-dir target

 test:
     make clean_test
-    pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds
+    nosetests --with-coverage --cover-html -s --verbosity=2 --cover-package=nerds nerds/test/

 lint:
-    pipenv run flake8 nerds --verbose
+    flake8 --ignore=W605,W504 --verbose nerds
27 changes: 27 additions & 0 deletions Makefile.pipenv
@@ -0,0 +1,27 @@
install:
pipenv install --dev -e .

download_models:
pipenv run python -m spacy download en
pipenv run python -m nltk.downloader averaged_perceptron_tagger

clean:
-rm -rf build
-rm -rf target
-find . -name "__pycache__" -type d -depth -exec rm -rf {} \;

clean_test:
-rm -rf cover
-rm .coverage

dist:
make clean
make download_models
pipenv run python setup.py bdist_wheel --dist-dir target

test:
make clean_test
pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds

lint:
pipenv run flake8 nerds --verbose
2 changes: 1 addition & 1 deletion Pipfile
@@ -20,7 +20,7 @@ scipy = "*"
 sklearn = "*"
 sklearn-crfsuite = "*"
 spacy = "==2.0.11"
-tensorflow = "*"
+tensorflow = ">=1.15.2"
 nose = "*"
 coverage = "*"
 "flake8" = "*"
1,335 changes: 0 additions & 1,335 deletions Pipfile.lock

This file was deleted.

84 changes: 64 additions & 20 deletions README.md
@@ -1,5 +1,5 @@
 # nerds
-![nerds logo](nerds.png)
+![nerds logo](docs/nerds.png)

# How to set up a DEV environment

@@ -15,9 +15,29 @@ pip3 install pipenv

This will make sure that `pipenv` uses your latest version of Python3, which is hopefully 3.6 or higher. Please refer to the [official website](https://docs.pipenv.org/) for more information on `pipenv`.

-A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, test and build the tool easily.
+A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, and test and build the tool easily. Note that this is the preferred environment setup approach: the `Pipfile` and `Pipfile.lock` files ensure that you automatically have access to the packages listed in `requirements.txt` after you run `make install` (see below).

-### Makefile specifications
+## Setting up the environment using `conda`

+Alternatively, if you are using the [Anaconda distribution of Python](https://www.anaconda.com/), you can also use `conda` to create an environment using the following command:

+```
+conda create -n nerds python=3.6 anaconda
+```

+You can then enter the newly created conda environment using the following command. After you run the various `make ...` commands, the packages listed in `requirements.txt` and the downloaded models will only be visible inside the `nerds` environment. This approach is usually preferred, since it can help prevent version collisions between different environments, at the cost of more disk space.

+```
+conda activate nerds
+```

+and exit the environment using the following command.

+```
+conda deactivate
+```

+## Makefile specifications

To install all of the required packages for development and testing run:

@@ -55,41 +75,65 @@ NERDS is a framework that provides some NER capabilities - among which the optio…

## Understanding the main data exchange classes

-There are 3 main classes in the `nerds.core.model.input.*` package that are used in our NER models: `Document`, `Annotation` and `AnnotatedDocument`.

-A `Document` class is the abstract representation of a raw document. It should always implement the `plain_text_` attribute, that returns the plain text representation of the object, as it's the one where we are going to perform NER. Therefore, whenever we want to process any new type of document format - XML, PDF, JSON, brat, etc. - the only requirement is to write an adapter that reads the file(s) from an input directory and transforms them to `Document` objects. The default `Document` object works seamlessly with `.txt` files.

-The `Annotation` class contains the data for a single annotation. This is the text (e.g. "fox"), the label (e.g. "ANIMAL") and the offsets that correspond to offsets in the `plain_text_` representation of a `Document` (e.g. 40-42).

-> **Important to note**: The offsets is a 2-tuple of integers that represent the position of the first and the last character of the annotation. Be careful, because some libraries end the offset one character **after** the final character i.e. at `start_offset + len(word)`. This is not the case with us, we currently end the offsets at **exactly** the final character i.e. at `start_offset + len(word) - 1`.

-Finally, the `AnnotatedDocument` class is a combination of `Document` and a list of `Annotation`, and it can represent two things:

-* Ground truth data (e.g. brat annotation files).
-* Predictions on documents after they run through our NER models.

-The `AnnotatedDocument` class exposes the `annotated_text_` attribute which returns the plain text representation of the document with inline annotations.

+The upstream NERDS project at [elsevierlabs-os/nerds](https://github.com/elsevierlabs-os/nerds) uses a set of custom data exchange classes: `Document`, `Annotation`, and `AnnotatedDocument`. It provides a set of conversion utilities to convert input datasets to this format, and to convert instances of these classes back to whatever format the underlying wrapped NER model needs. This NERDS fork at [sujitpal/nerds](https://github.com/sujitpal/nerds) eliminates that requirement -- the internal format is just a list of lists of tokens (the words in each sentence) and a parallel list of lists of BIO tags. The utility function `nerds.utils.load_data_and_labels` can read a file in CoNLL BIO format and convert it to this internal format. This decision was made because 3 of the 5 provided models consume the list-of-lists format natively, so the result is fewer lines of extra code and less potential for error.

+In general, when given an input format that is not CoNLL BIO, the main effort in using NERDS is converting the input to CoNLL BIO format. Once that is done, it is relatively easy to ingest it into a data and label structure, as shown below.

+```python
+from nerds.utils import load_data_and_labels
+
+data, labels = load_data_and_labels("nerds/test/data/example.iob")
+print("data:", data)
+print("labels:", labels)
+```

+yields the following output.

+```
+data: [
+['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.'],
+['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.', ',', 'the', 'Dutch', 'publishing', 'group', '.']
+]
+labels: [
+['B-PER', 'I-PER', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O'],
+['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']
+]
+```

## Extending the base model class

-The basic class that every model needs to extend is the `NERModel` class in the `nerds.core.model.ner.base` package. The model class implements a `fit - transform` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum:
+The basic class that every model needs to extend is the `NERModel` class in the `nerds.models` package. The model class implements a `fit`/`predict` API similar to `sklearn`'s. To implement a new model, one must override the following methods at minimum:

-* `fit`: Trains a model given a list of `AnnotatedDocument` objects.
-* `transform`: Gets a list of `Document` objects and transforms them to `AnnotatedDocument`.
-* `save`: Disk persistence of a model.
-* `load`: Disk persistence of a model.
+* `fit(X, y)`: Trains a model given a list of list of tokens X and BIO tags y.
+* `predict(X)`: Returns a list of list of BIO tags, given a list of list of tokens X.
+* `save(dirpath)`: Saves model to the directory given by dirpath.
+* `load(dirpath)`: Retrieves model from the directory given by dirpath.

-Please note that **all** of the class methods, utility functions, etc. should operate on `Document` and `AnnotatedDocument` objects, to maintain compatibility with the rest of the framework. The only exception is "private" methods used internally in classes.
+As a best practice, I like to implement a single NER model (or group of related NER models) as a single file in the `models` folder, but have it be accessible from client code directly as `nerds.models.CustomNER`. You can set this redirection up in `nerds/models/__init__.py`.
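
For illustration, the following is a minimal sketch of what such a model might look like. The `LexiconNER` class, its module file `nerds/models/lexicon.py`, and the assumption that `NERModel` can be subclassed with a no-argument constructor are all hypothetical; only the `fit`/`predict`/`save`/`load` contract comes from the description above.

```python
import os

import joblib

from nerds.models import NERModel


class LexiconNER(NERModel):
    """ Hypothetical sketch: remembers the BIO tag of every token seen
        during training and replays it at prediction time.
    """

    def __init__(self):
        super().__init__()
        self.lexicon_ = {}

    def fit(self, X, y):
        # X is a list of list of tokens, y is a list of list of BIO tags
        for tokens, tags in zip(X, y):
            for token, tag in zip(tokens, tags):
                if tag != "O":
                    self.lexicon_[token] = tag
        return self

    def predict(self, X):
        # tokens not seen during training default to the "O" (outside) tag
        return [[self.lexicon_.get(token, "O") for token in tokens]
                for tokens in X]

    def save(self, dirpath):
        os.makedirs(dirpath, exist_ok=True)
        joblib.dump(self.lexicon_, os.path.join(dirpath, "lexicon.pkl"))

    def load(self, dirpath):
        self.lexicon_ = joblib.load(os.path.join(dirpath, "lexicon.pkl"))
        return self
```

The class would then be exposed as `nerds.models.LexiconNER` by adding `from nerds.models.lexicon import LexiconNER` to `nerds/models/__init__.py`.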

# Running experiments

-So, let's assume you have a dataset that contains annotated text. If it's in a format that is already supported (e.g. [brat](http://brat.nlplab.org/standoff.html)), then you may just load it into `AnnotatedDocument` objects using the built-in classes. Otherwise, you will have to extend the `nerds.core.model.input.DataInput` class to support the format. Then, you may use the built-in NER models (or create your own) either alone, or in an ensemble and evaluate their predictive capabilities on your dataset.
+There are two examples of running experiments using NERDS; a minimal end-to-end sketch follows the list below. We will continue to update these examples as more functionality becomes available.

-In the `nerds.core.model.evaluate` package, there are helper methods and classes to perform k-fold cross-validation. Please, refer to the `nerds.examples` package where you may look at working code examples with real datasets.
+* [examples/GMB](examples/GMB)
+* [examples/BioNLP](examples/BioNLP)
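
Here is a minimal sketch of an end-to-end run; the model class name (`CrfNER`) and the file paths are hypothetical, so substitute whichever NERDS model and dataset you are working with.

```python
from sklearn.metrics import classification_report

from nerds.models import CrfNER  # hypothetical model name
from nerds.utils import load_data_and_labels

# load train and test splits already converted to CoNLL BIO format
Xtrain, ytrain = load_data_and_labels("train.iob")
Xtest, ytest = load_data_and_labels("test.iob")

model = CrfNER()
model.fit(Xtrain, ytrain)     # train on lists of token lists and tag lists
ypred = model.predict(Xtest)  # list of list of predicted BIO tags
model.save("models/crf")      # persist; restore later with model.load(...)

# flatten the sentence-level tag lists for a token-level report
flat_true = [tag for sent in ytest for tag in sent]
flat_pred = [tag for sent in ypred for tag in sent]
print(classification_report(flat_true, flat_pred))
```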

# Contributing to the project

New models and input adapters are always welcome. Please make sure your code is well-documented and readable. Before creating a pull request make sure:

* `make test` shows that all the unit test pass.
* `make lint` shows no Python code violations.

+The [CONTRIBUTING.md file](docs/CONTRIBUTING.md) lists contributors who have contributed to the [NERDS (elsevierlabs-os/nerds)](https://github.com/elsevierlabs-os/nerds) project.

+# Changes / Improvements in this Fork

+The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that were made in this fork.

+# Talks and Blogs

+* \[slides\] [Slides for talk at PyData LA 2019](https://www.slideshare.net/sujitpal/building-named-entity-recognition-models-efficiently-using-nerds).
+* \[video\] [Video of talk at PyData LA 2019](https://www.youtube.com/watch?v=ilzFiK0nAh8).
+* \[blog\] [Incorporating third party NER (Flair) into NERDS](https://sujitpal.blogspot.com/2019/12/incorporating-flair-ner-into-nerds.html).
+* \[blog\] [Adding a Transformer based NER model into NERDS](https://sujitpal.blogspot.com/2020/01/adding-transformer-based-ner-model-into.html).
178 changes: 178 additions & 0 deletions converters/brat2iob.py
@@ -0,0 +1,178 @@
import argparse
import os
import re
import shutil
import spacy
import tempfile

from nerds.utils import spans_to_tokens, get_logger

def segment_text_to_sentences(text_file, sentence_splitter):
""" Segment text into sentences. Text is provided by BRAT in .txt
file.

Args:
text_file (str): the full path to the BRAT .txt file.
sentence_splitter (spacy LM): SpaCy EN language model.

Returns:
sentences (list((int, int, str))): list of sentence spans.
Spans are triples of (start_offset, end_offset, text),
where offset is relative to the text.
"""
sentences = []
ftext = open(text_file, "r")
for line in ftext:
splits = sentence_splitter(line.strip())
for sent in splits.sents:
sentences.append((sent.start_char, sent.end_char, sent.text))
ftext.close()
return sentences


def parse_text_annotations(ann_file):
""" Parses BRAT annotations provided in the .ann file and converts them
to annotation spans of (start_position, end_position, entity_class).

Args:
ann_file (str): full path to the BRAT .ann file.

Returns:
annotations (list((int, int, str))): list of annotation spans.
Spans are triples of (start_offset, end_offset, entity_class)
where offset is relative to the text.
"""
annots = []
fann = open(ann_file, "r")
for line in fann:
cols = re.split(r"\s+", line.strip())
if not cols[0].startswith("T"):
continue
annots.append((int(cols[2]), int(cols[3]), cols[1]))
fann.close()
return annots


def apply_annotations(sentences, annotations, tokenizer):
""" Apply annotation spans to the sentence spans to create a list of tokens
and tags.

Args:
sentences (list((int, int, str))): list of sentence spans.
annotations (list((int, int, str))): list of annotation spans.
tokenizer (spacy LM): SpaCy EN language model.

Returns:
tokens_tags_list (list((list(str), list(str)))): list of list of token
tag pairs. Each list of token-tag pairs corresponds to a single
sentence.
"""
tokens_tags_list = []
for sent_start, sent_end, sent_text in sentences:
sent_annots = [a for a in annotations if a[0] >= sent_start and a[1] <= sent_end]
# convert document offsets to sentence offsets
sent_annots = [(s[0] - sent_start, s[1] - sent_start, s[2]) for s in sent_annots]
tokens, tags = spans_to_tokens(sent_text, sent_annots, tokenizer)
tokens_tags_list.append(zip(tokens, tags))
return tokens_tags_list


def convert_brat_to_iob(input_dir, output_file, nlp):
""" Convenience Convertor function.

Args:
input_dir (str): the directory where the BRAT .txt and .ann files
are located.
output_file (str): the full path name of file to write output in
IOB format to.
nlp (SpaCy LM): reference to the SpaCy EN model.

Returns:
None.
"""
fout = open(output_file, "w")
for text_file in os.listdir(input_dir):
# only process .txt and .ann pairs in specified directory
if not text_file.endswith(".txt"):
continue
annot_file = text_file[:-4] + ".ann"
if not os.path.exists(os.path.join(input_dir, annot_file)):
# do not process file if no corresponding .ann file
continue
# process file pair
logger.info("Processing file: {:s}".format(text_file))
sentences = segment_text_to_sentences(os.path.join(input_dir, text_file), nlp)
annotations = parse_text_annotations(os.path.join(input_dir, annot_file))
tokens_tags_list = apply_annotations(sentences, annotations, nlp)
for tokens_tags in tokens_tags_list:
for token, tag in tokens_tags:
fout.write("{:s}\t{:s}\n".format(token, tag))
fout.write("\n")

fout.close()


def do_self_test(nlp):
""" Simple self-test with small dataset to prove that this works okay. """
text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director, Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group."
annotations = [
"T1 PER 0 13 Pierre Vinken",
"T2 PER 86 96 Mr. Vinken",
"T3 DATE 15 27 61 years old",
"T4 DATE 77 84 Nov. 29",
"T5 ORG 112 125 Elsevier N.V.",
"T6 NORP 131 136 Dutch"
]
input_dir = tempfile.mkdtemp(dir="/tmp")
ftext = open(os.path.join(input_dir, "test.txt"), "w")
ftext.write(text)
ftext.close()
fann = open(os.path.join(input_dir, "test.ann"), "w")
for line in annotations:
fann.write(line + "\n")
fann.close()
output_file = os.path.join(input_dir, "test.iob")
convert_brat_to_iob(input_dir, output_file, nlp)
fout = open(output_file, "r")
for line in fout:
        logger.warning(line.strip())
shutil.rmtree(input_dir)


################################ main ################################
#
# usage: brat2iob.py [-h] [-i INPUT_DIR] [-o OUTPUT_FILE] [-t]
# Script to convert BRAT annotations to IOB (NERDS) format.
# optional arguments:
# -h, --help show this help message and exit
# -i INPUT_DIR, --input_dir INPUT_DIR
# Directory to store BRAT .txt and .ann files.
# -o OUTPUT_FILE, --output_file OUTPUT_FILE
# Output file to write IOB output to.
# -t, --test Runs self test.
######################################################################

parser = argparse.ArgumentParser(
description="Script to convert BRAT annotations to IOB (NERDS) format.")
parser.add_argument("-i", "--input_dir", help="Directory to store BRAT .txt and .ann files.")
parser.add_argument("-o", "--output_file", help="Output file to write IOB output to.")
parser.add_argument("-t", "--test", help="Runs self test.", action="store_true")
args = parser.parse_args()

logger = get_logger()

input_dir = args.input_dir
output_file = args.output_file
self_test = args.test

nlp = spacy.load("en")

if self_test:
logger.info("Executing self test...")
do_self_test(nlp)
else:
logger.info("Reading BRAT .txt and .ann files from: {:s}".format(input_dir))
logger.info("Writing IOB tokens/tags to file: {:s}".format(output_file))
convert_brat_to_iob(input_dir, output_file, nlp)
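
For example, assuming a directory `data/brat/` containing paired `.txt` and `.ann` files (the paths here are illustrative), the converter can be run as:

```
python converters/brat2iob.py -i data/brat -o data/train.iob
```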
