NVIDIA
diff --git a/‎TensorFlow/LanguageModeling/BERT/Dockerfile‎
Lines changed: 2 additions & 2 deletions b/‎TensorFlow/LanguageModeling/BERT/Dockerfile‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎TensorFlow/LanguageModeling/BERT/README.md‎
Lines changed: 2 additions & 2 deletions b/‎TensorFlow/LanguageModeling/BERT/README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎TensorFlow/LanguageModeling/BERT/data/ChemProtTextFormatting.py‎
Lines changed: 169 additions & 0 deletions b/‎TensorFlow/LanguageModeling/BERT/data/ChemProtTextFormatting.py‎
Lines changed: 169 additions & 0 deletions
diff --git a/‎TensorFlow/LanguageModeling/BERT/data/Downloader.py‎
Lines changed: 3 additions & 0 deletions b/‎TensorFlow/LanguageModeling/BERT/data/Downloader.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎TensorFlow/LanguageModeling/BERT/data/bertPrep.py‎
Lines changed: 2 additions & 1 deletion b/‎TensorFlow/LanguageModeling/BERT/data/bertPrep.py‎
Lines changed: 2 additions & 1 deletion
@@ -17,12 +17,12 @@ RUN git clone https://github.com/titipata/pubmed_parser
 RUN pip3 install /workspace/pubmed_parser
 
 #Copy the perf_client over
-ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.14.0/v1.14.0_ubuntu1804.clients.tar.gz
+ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.0.0/v2.0.0_ubuntu1804.clients.tar.gz
 RUN mkdir -p /workspace/install \
     && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install
 
 #Install the python wheel with pip
-RUN pip install /workspace/install/python/tensorrtserver-1.14.0-py3-none-linux_x86_64.whl
+RUN pip install /workspace/install/python/triton*.whl
 
 WORKDIR /workspace/bert
 COPY . .
 
@@ -729,9 +729,9 @@ Note: Time to train includes upto 16 minutes of start up time for every restart
 
 Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
 
-| **GPUs** | **Batch size / GPU** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
+| **GPUs** | **Batch size / GPU: TF32, FP16 ** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
 |:---:|:----:|:----:|:---:|:----:|:----:|
-| 8 | 24 |91.41 |91.52 |0.26|0.26|
+| 8 | 16, 24 |91.41 |91.52 |0.26|0.26|
 
 ###### Fine-tuning accuracy for GLUE MRPC: NVIDIA DGX A100 (8x A100 40G)
 
 
@@ -0,0 +1,169 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import csv
+import zipfile
+import argparse
+import re
+
+class ChemProtTextFormatting:
+    """A basic formatter to preprocess the chemprot dataset.
+    """
+
+    def __init__(self, input_folder, output_folder):
+
+        chemprot_folder = input_folder
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "ChemProt_Corpus.zip"), "r") as zip:
+            zip.extractall(chemprot_folder)
+
+        chemprot_folder = os.path.join(input_folder, "ChemProt_Corpus")
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_development.zip")) as zip:
+            zip.extractall(chemprot_folder)
+
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+
+        self.format(os.path.join(chemprot_folder, "chemprot_development"),
+                    "chemprot_development_entities.tsv", "chemprot_development_relations.tsv",
+                    "chemprot_development_abstracts.tsv", os.path.join(output_folder, "dev.tsv"))
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_test_gs.zip")) as zip:
+            zip.extractall(chemprot_folder)
+        self.format(os.path.join(chemprot_folder, "chemprot_test_gs"),
+                    "chemprot_test_entities_gs.tsv", "chemprot_test_relations_gs.tsv",
+                    "chemprot_test_abstracts_gs.tsv", os.path.join(output_folder, "test.tsv"))
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_training.zip")) as zip:
+            zip.extractall(chemprot_folder)
+        self.format(os.path.join(chemprot_folder, "chemprot_training"),
+                    "chemprot_training_entities.tsv", "chemprot_training_relations.tsv",
+                    "chemprot_training_abstracts.tsv", os.path.join(output_folder, "train.tsv"))
+
+
+
+    def format(self, chemprot_path, entity_filename, relations_filename, abstracts_filename, output_filename):
+        """
+        Constructs ChemProt dataset for Relation Extraction.
+
+        Args:
+          chemprot_path: Path to files
+          entity_filename: Contains labelled mention annotations of chemical compounds and genes/proteins.
+                            <PMID> <EntityNumber> <Type of Entity> <Start Character offset> <End Character Offset> <Text String>
+          relations_filename: Contains a subset of chemical-protein relations annotations for the Chemprot dataset
+                            <PMID> <CPR Group> <EntityNumber1> <EntityNumber2>
+          abstracts_filename: Contains plain text CHEMPROT PubMed Data
+                            <PMID> <Title of the Article> <Abstract of the Article>
+          output_filename: Path to output file that will contain preprocessed data
+                            <PMID.EntityNumber1.EntityNumber2> <Preprocessed Sentence> <CPR Group>
+        """
+
+        data = {}
+        train_entities = csv.reader(open(os.path.join(chemprot_path, entity_filename),
+                                         mode="r"), delimiter="\t")
+        for entity in train_entities:
+            id = entity[0]
+            if data.get(id, None) is None:
+                data[id] = {"relations":{}, "entities":{"CHEMICAL":{}, "GENE":{}}}
+            data[id]["entities"]["CHEMICAL" if entity[2] == "CHEMICAL" else "GENE"][entity[1]] = (int(entity[3]), int(entity[4]), entity[2])
+
+        train_relations=csv.reader(open(os.path.join(chemprot_path, relations_filename),
+                                   mode="r"), delimiter="\t")
+        for relation in train_relations:
+            try:
+                id = relation[0]
+                data[id]["relations"][(relation[4].split("Arg1:")[-1], relation[5].split("Arg2:")[-1])] = relation[1] if relation[2] == "Y " else "false"
+            except:
+                print("invalid id")
+                raise ValueError
+        # print(data[list(data.keys())[0]])
+
+        with open(output_filename, 'w') as ofile:
+            train_abstracts = csv.reader(open(os.path.join(chemprot_path, abstracts_filename),
+                                              mode="r"), delimiter="\t")
+            owriter = csv.writer(ofile, delimiter='\t', lineterminator=os.linesep)
+            owriter.writerow(["index", "sentence", "label"])
+
+            num_sentences = 0
+            rejected = 0
+            for abstract in train_abstracts:
+                id = abstract[0]
+                line = abstract[1] + "\n" + abstract[2]
+
+                for tag1 in data[id]["entities"]["CHEMICAL"].keys():
+                    for tag2 in data[id]["entities"]["GENE"].keys():
+                        tag1_details = data[id]["entities"]["CHEMICAL"][tag1]
+                        tag2_details = data[id]["entities"]["GENE"][tag2]
+                        if ((tag1_details[0] <= tag2_details[0] and tag2_details[0] <= tag1_details[1]) # x1 <= y1 <= x2
+                            or (tag1_details[0] <= tag2_details[1] and tag2_details[0] <= tag1_details[1])): # x1 <= y2 <= x2
+                            continue
+
+                        relation = data[id]["relations"].get((tag2, tag1), None)
+                        relation = data[id]["relations"].get((tag1, tag2), None) if relation is None else relation
+                        if relation is None:
+                            relation = "false"
+
+                        start = 0
+                        line_protected = re.sub(r"(.)\.(?=[\d])", r"\1[PROTECTED_DOT]", line)
+                        for sentence in re.split(r'\.|\?', line_protected):
+                            sentence = sentence.replace("[PROTECTED_DOT]", ".")
+                            original_sentence = sentence
+                            end = start + len(sentence)
+
+                            if (tag1_details[0] >= start and tag1_details[1] <= end) and \
+                                    (tag2_details[0] >= start and tag2_details[1] <= end):
+                                for offset_start, offset_end, value in sorted(list(data[id]["entities"]["CHEMICAL"].values()) + list(data[id]["entities"]["GENE"].values()),
+                                                         reverse=True):
+                                    if (offset_start, offset_end) == (tag1_details[0], tag1_details[1]) or (offset_start, offset_end) == (tag2_details[0], tag2_details[1]):
+                                        if sentence[offset_start - start] == "@":
+                                            offset_end = start + sentence.find('$',offset_start - start) + 1
+                                        word = value
+                                    elif offset_start < start or offset_end > end or sentence[offset_start - start] == "@":
+                                        continue
+                                    else:
+                                        word = "OTHER"
+                                    sentence = sentence[:offset_start-start] + "@" + word + "$" + sentence[offset_end-start:]
+                                sentence = sentence.strip()
+                                owriter.writerow([id+"."+tag1+"."+tag2, sentence, relation])
+                                num_sentences += 1
+                                if id == "23538201" and start == 1048:
+                                    print("Accepted", tag1, tag2)
+
+                            else:
+                                rejected += 1
+
+                            start = end + 1
+            print("Succesfully written {} samples to {}".format(num_sentences, output_filename))
+            print("Rejected are", rejected)
+
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(
+        description='Preprocessing Application for ChemProt'
+    )
+
+    parser.add_argument(
+        '--input_folder',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+    parser.add_argument(
+        '--output_folder',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+
+
+    args = parser.parse_args()
+    preprocess_chemprot = ChemProtTextFormatting(args.input_folder, args.output_folder)
@@ -61,6 +61,8 @@ def download(self):
 
         elif self.dataset_name == 'CoLA':
             self.download_glue(self.dataset_name)
+        elif self.dataset_name == 'SST':
+            self.download_glue(self.dataset_name)
 
         elif self.dataset_name == 'squad':
             self.download_squad()
@@ -78,6 +80,7 @@ def download(self):
             self.download_glue("CoLA")
             self.download_glue("MNLI")
             self.download_glue("MRPC")
+            self.download_glue("SST")
             self.download_squad()
 
         else:
 
@@ -63,7 +63,7 @@ def main(args):
     elif args.action == 'text_formatting':
         assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
                and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
-               args.dataset != 'MNLI', 'Cannot perform text_formatting on pretrained weights'
+               args.dataset != 'MNLI' and args.dataset != 'SST', 'Cannot perform text_formatting on pretrained weights'
 
         if not os.path.exists(directory_structure['extracted']):
             os.makedirs(directory_structure['extracted'])
@@ -274,6 +274,7 @@ def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
             'MRPC',
             'CoLA',
             'MNLI',
+            'SST',
             'all'
         }
     )