Skip to content

Commit b82c372

Browse files
author
Swetha Mandava
committed
triton v2 api, download mrpc fix, update for mpi 4.2
1 parent 769843e commit b82c372

17 files changed

+581
-48
lines changed

TensorFlow/LanguageModeling/BERT/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ RUN git clone https://github.com/titipata/pubmed_parser
1717
RUN pip3 install /workspace/pubmed_parser
1818

1919
#Copy the perf_client over
20-
ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.14.0/v1.14.0_ubuntu1804.clients.tar.gz
20+
ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.0.0/v2.0.0_ubuntu1804.clients.tar.gz
2121
RUN mkdir -p /workspace/install \
2222
&& curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install
2323

2424
#Install the python wheel with pip
25-
RUN pip install /workspace/install/python/tensorrtserver-1.14.0-py3-none-linux_x86_64.whl
25+
RUN pip install /workspace/install/python/triton*.whl
2626

2727
WORKDIR /workspace/bert
2828
COPY . .

TensorFlow/LanguageModeling/BERT/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,9 +729,9 @@ Note: Time to train includes up to 16 minutes of start up time for every restart
729729

730730
Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
731731

732-
| **GPUs** | **Batch size / GPU** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
732+
| **GPUs** | **Batch size / GPU: TF32, FP16** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
733733
|:---:|:----:|:----:|:---:|:----:|:----:|
734-
| 8 | 24 |91.41 |91.52 |0.26|0.26|
734+
| 8 | 16, 24 |91.41 |91.52 |0.26|0.26|
735735

736736
###### Fine-tuning accuracy for GLUE MRPC: NVIDIA DGX A100 (8x A100 40G)
737737

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
import os
15+
import csv
16+
import zipfile
17+
import argparse
18+
import re
19+
20+
class ChemProtTextFormatting:
    """Preprocess the ChemProt corpus for BERT relation extraction.

    Extracts the nested ChemProt zip archives found in ``input_folder`` and
    writes ``train.tsv``, ``dev.tsv`` and ``test.tsv`` (index / sentence /
    label) into ``output_folder``.
    """

    def __init__(self, input_folder, output_folder):
        # The top-level archive contains one nested zip per split
        # (development, test gold-standard, training).
        chemprot_folder = input_folder
        with zipfile.ZipFile(os.path.join(chemprot_folder, "ChemProt_Corpus.zip"), "r") as archive:
            archive.extractall(chemprot_folder)

        chemprot_folder = os.path.join(input_folder, "ChemProt_Corpus")

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_development.zip")) as archive:
            archive.extractall(chemprot_folder)

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        self.format(os.path.join(chemprot_folder, "chemprot_development"),
                    "chemprot_development_entities.tsv", "chemprot_development_relations.tsv",
                    "chemprot_development_abstracts.tsv", os.path.join(output_folder, "dev.tsv"))

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_test_gs.zip")) as archive:
            archive.extractall(chemprot_folder)
        self.format(os.path.join(chemprot_folder, "chemprot_test_gs"),
                    "chemprot_test_entities_gs.tsv", "chemprot_test_relations_gs.tsv",
                    "chemprot_test_abstracts_gs.tsv", os.path.join(output_folder, "test.tsv"))

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_training.zip")) as archive:
            archive.extractall(chemprot_folder)
        self.format(os.path.join(chemprot_folder, "chemprot_training"),
                    "chemprot_training_entities.tsv", "chemprot_training_relations.tsv",
                    "chemprot_training_abstracts.tsv", os.path.join(output_folder, "train.tsv"))

    def format(self, chemprot_path, entity_filename, relations_filename, abstracts_filename, output_filename):
        """Construct the ChemProt Relation Extraction dataset for one split.

        Args:
            chemprot_path: Path to the extracted split folder.
            entity_filename: Labelled mention annotations of chemical compounds and genes/proteins.
                <PMID> <EntityNumber> <Type of Entity> <Start Character offset> <End Character Offset> <Text String>
            relations_filename: Subset of chemical-protein relation annotations.
                <PMID> <CPR Group> <Eval flag> <...> <Arg1:EntityNumber1> <Arg2:EntityNumber2>
            abstracts_filename: Plain-text ChemProt PubMed data.
                <PMID> <Title of the Article> <Abstract of the Article>
            output_filename: Output TSV that will contain the preprocessed data.
                <PMID.EntityNumber1.EntityNumber2> <Preprocessed Sentence> <CPR Group>
        """
        data = {}

        # Pass 1: index entity mentions by PMID. GENE-Y / GENE-N variants are
        # collapsed into the GENE bucket; the original tag (entity[2]) is kept
        # as the mask word used later.
        with open(os.path.join(chemprot_path, entity_filename), mode="r") as entity_file:
            for entity in csv.reader(entity_file, delimiter="\t"):
                pmid = entity[0]
                if pmid not in data:
                    data[pmid] = {"relations": {}, "entities": {"CHEMICAL": {}, "GENE": {}}}
                bucket = "CHEMICAL" if entity[2] == "CHEMICAL" else "GENE"
                data[pmid]["entities"][bucket][entity[1]] = (int(entity[3]), int(entity[4]), entity[2])

        # Pass 2: index relations by (Arg1, Arg2) entity-number pair. Only
        # gold evaluations (eval flag "Y ") keep their CPR group; the rest
        # are labelled "false".
        with open(os.path.join(chemprot_path, relations_filename), mode="r") as relations_file:
            for relation in csv.reader(relations_file, delimiter="\t"):
                try:
                    pmid = relation[0]
                    pair = (relation[4].split("Arg1:")[-1], relation[5].split("Arg2:")[-1])
                    data[pmid]["relations"][pair] = relation[1] if relation[2] == "Y " else "false"
                except (KeyError, IndexError) as err:
                    raise ValueError(
                        "Relation row references an unknown PMID or is malformed: {}".format(relation)
                    ) from err

        # Pass 3: for every chemical/gene pair in each abstract, emit the
        # sentence containing both mentions with entities masked as @WORD$.
        with open(output_filename, 'w') as ofile, \
                open(os.path.join(chemprot_path, abstracts_filename), mode="r") as abstracts_file:
            train_abstracts = csv.reader(abstracts_file, delimiter="\t")
            owriter = csv.writer(ofile, delimiter='\t', lineterminator=os.linesep)
            owriter.writerow(["index", "sentence", "label"])

            num_sentences = 0
            rejected = 0
            for abstract in train_abstracts:
                pmid = abstract[0]
                # Offsets in the entity file are relative to "title\nabstract".
                line = abstract[1] + "\n" + abstract[2]

                for tag1, tag1_details in data[pmid]["entities"]["CHEMICAL"].items():
                    for tag2, tag2_details in data[pmid]["entities"]["GENE"].items():
                        # Skip pairs whose character spans overlap.
                        if ((tag1_details[0] <= tag2_details[0] and tag2_details[0] <= tag1_details[1])  # x1 <= y1 <= x2
                                or (tag1_details[0] <= tag2_details[1] and tag2_details[0] <= tag1_details[1])):
                            continue

                        relation = data[pmid]["relations"].get((tag2, tag1), None)
                        relation = data[pmid]["relations"].get((tag1, tag2), None) if relation is None else relation
                        if relation is None:
                            relation = "false"

                        start = 0
                        # Protect decimal points (e.g. "1.5") so sentence
                        # splitting does not break inside numbers.
                        line_protected = re.sub(r"(.)\.(?=[\d])", r"\1[PROTECTED_DOT]", line)
                        for sentence in re.split(r'\.|\?', line_protected):
                            sentence = sentence.replace("[PROTECTED_DOT]", ".")
                            end = start + len(sentence)

                            if (tag1_details[0] >= start and tag1_details[1] <= end) and \
                                    (tag2_details[0] >= start and tag2_details[1] <= end):
                                # Both mentions fall inside this sentence: mask the
                                # pair with their entity tags and every other in-sentence
                                # mention with @OTHER$. Replace right-to-left so earlier
                                # offsets stay valid while the sentence grows/shrinks.
                                all_mentions = sorted(
                                    list(data[pmid]["entities"]["CHEMICAL"].values())
                                    + list(data[pmid]["entities"]["GENE"].values()),
                                    reverse=True)
                                for offset_start, offset_end, value in all_mentions:
                                    if (offset_start, offset_end) == (tag1_details[0], tag1_details[1]) \
                                            or (offset_start, offset_end) == (tag2_details[0], tag2_details[1]):
                                        if sentence[offset_start - start] == "@":
                                            # Already masked by a previous replacement:
                                            # extend the span to the closing '$'.
                                            offset_end = start + sentence.find('$', offset_start - start) + 1
                                        word = value
                                    elif offset_start < start or offset_end > end or sentence[offset_start - start] == "@":
                                        continue
                                    else:
                                        word = "OTHER"
                                    sentence = sentence[:offset_start - start] + "@" + word + "$" + sentence[offset_end - start:]
                                sentence = sentence.strip()
                                owriter.writerow([pmid + "." + tag1 + "." + tag2, sentence, relation])
                                num_sentences += 1
                            else:
                                rejected += 1

                            start = end + 1

        print("Successfully written {} samples to {}".format(num_sentences, output_filename))
        print("Rejected are", rejected)
149+
150+
151+
if __name__ == "__main__":
    # Command-line entry point: extract and preprocess the ChemProt corpus.
    parser = argparse.ArgumentParser(
        description='Preprocessing Application for ChemProt'
    )

    parser.add_argument(
        '--input_folder',
        type=str,
        help='Folder containing ChemProt_Corpus.zip'
    )
    parser.add_argument(
        '--output_folder',
        type=str,
        help='Folder that will receive the preprocessed train.tsv, dev.tsv and test.tsv'
    )

    args = parser.parse_args()
    preprocess_chemprot = ChemProtTextFormatting(args.input_folder, args.output_folder)

TensorFlow/LanguageModeling/BERT/data/Downloader.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ def download(self):
6161

6262
elif self.dataset_name == 'CoLA':
6363
self.download_glue(self.dataset_name)
64+
elif self.dataset_name == 'SST':
65+
self.download_glue(self.dataset_name)
6466

6567
elif self.dataset_name == 'squad':
6668
self.download_squad()
@@ -78,6 +80,7 @@ def download(self):
7880
self.download_glue("CoLA")
7981
self.download_glue("MNLI")
8082
self.download_glue("MRPC")
83+
self.download_glue("SST")
8184
self.download_squad()
8285

8386
else:

TensorFlow/LanguageModeling/BERT/data/bertPrep.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def main(args):
6363
elif args.action == 'text_formatting':
6464
assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
6565
and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
66-
args.dataset != 'MNLI', 'Cannot perform text_formatting on pretrained weights'
66+
args.dataset != 'MNLI' and args.dataset != 'SST', 'Cannot perform text_formatting on pretrained weights'
6767

6868
if not os.path.exists(directory_structure['extracted']):
6969
os.makedirs(directory_structure['extracted'])
@@ -274,6 +274,7 @@ def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
274274
'MRPC',
275275
'CoLA',
276276
'MNLI',
277+
'SST',
277278
'all'
278279
}
279280
)

0 commit comments

Comments
 (0)