Skip to content

Commit 1069a73

Browse files
author
Swetha Mandava
committed
converge to pyt
1 parent efd6384 commit 1069a73

File tree

5 files changed

+45
-108
lines changed

5 files changed

+45
-108
lines changed

TensorFlow/LanguageModeling/BERT/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ FROM ${FROM_IMAGE_NAME}
55
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl libb64-dev
66
RUN pip install --upgrade pip
77
RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
8-
RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger
8+
RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger wget
99

1010
WORKDIR /workspace
1111
RUN git clone https://github.com/openai/gradient-checkpointing.git

TensorFlow/LanguageModeling/BERT/data/Downloader.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,15 @@ def download(self):
5353
elif self.dataset_name == 'nvidia_pretrained_weights':
5454
self.download_nvidia_pretrained_weights()
5555

56-
elif self.dataset_name == 'MRPC':
56+
elif self.dataset_name == 'mrpc':
5757
self.download_glue(self.dataset_name)
5858

59-
elif self.dataset_name == 'MNLI':
59+
elif self.dataset_name == 'mnli':
6060
self.download_glue(self.dataset_name)
6161

62-
elif self.dataset_name == 'CoLA':
62+
elif self.dataset_name == 'cola':
6363
self.download_glue(self.dataset_name)
64-
elif self.dataset_name == 'SST':
64+
elif self.dataset_name == 'sst-2':
6565
self.download_glue(self.dataset_name)
6666

6767
elif self.dataset_name == 'squad':
@@ -77,10 +77,10 @@ def download(self):
7777
self.download_pubmed('open_access')
7878
self.download_google_pretrained_weights()
7979
self.download_nvidia_pretrained_weights()
80-
self.download_glue("CoLA")
81-
self.download_glue("MNLI")
82-
self.download_glue("MRPC")
83-
self.download_glue("SST")
80+
self.download_glue("cola")
81+
self.download_glue("mnli")
82+
self.download_glue("mrpc")
83+
self.download_glue("sst-2")
8484
self.download_squad()
8585

8686
else:
@@ -114,8 +114,8 @@ def download_nvidia_pretrained_weights(self):
114114

115115

116116
def download_glue(self, glue_task_name):
117-
downloader = GLUEDownloader(glue_task_name, self.save_path)
118-
downloader.download()
117+
downloader = GLUEDownloader(self.save_path)
118+
downloader.download(glue_task_name)
119119

120120

121121
def download_squad(self):

TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py

Lines changed: 26 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -11,99 +11,36 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
import bz2
15-
import os
16-
import urllib
1714
import sys
18-
import zipfile
19-
import io
15+
import wget
2016

21-
URLLIB=urllib
22-
if sys.version_info >= (3, 0):
23-
URLLIB=urllib.request
17+
from pathlib import Path
2418

25-
class GLUEDownloader:
26-
def __init__(self, task, save_path):
27-
28-
# Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
29-
30-
self.TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
31-
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
32-
"MRPC":{"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
33-
"mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
34-
"mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
35-
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
36-
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
37-
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
38-
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
39-
"QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
40-
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
41-
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
42-
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
43-
44-
45-
self.save_path = save_path
46-
if not os.path.exists(self.save_path):
47-
os.makedirs(self.save_path)
48-
49-
self.task = task
5019

51-
def download(self):
20+
def mkdir(path):
21+
Path(path).mkdir(parents=True, exist_ok=True)
5222

53-
if self.task == 'MRPC':
54-
self.download_mrpc()
55-
elif self.task == 'diagnostic':
56-
self.download_diagnostic()
57-
else:
58-
self.download_and_extract(self.task)
5923

60-
def download_and_extract(self, task):
61-
print("Downloading and extracting %s..." % task)
62-
data_file = "%s.zip" % task
63-
URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
64-
print(data_file,"\n\n\n")
65-
with zipfile.ZipFile(data_file) as zip_ref:
66-
zip_ref.extractall(self.save_path)
67-
os.remove(data_file)
68-
print("\tCompleted!")
69-
70-
def download_mrpc(self):
71-
print("Processing MRPC...")
72-
mrpc_dir = os.path.join(self.save_path, "MRPC")
73-
if not os.path.isdir(mrpc_dir):
74-
os.mkdir(mrpc_dir)
75-
76-
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
77-
mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
78-
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
79-
80-
URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
81-
URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
82-
URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)
83-
84-
dev_ids = []
85-
with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
86-
for row in ids_fh:
87-
dev_ids.append(row.strip().split('\t'))
88-
89-
with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
90-
io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
91-
io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
92-
header = data_fh.readline()
93-
train_fh.write(header)
94-
dev_fh.write(header)
95-
for row in data_fh:
96-
label, id1, id2, s1, s2 = row.strip().split('\t')
97-
if [id1, id2] in dev_ids:
98-
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
99-
else:
100-
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
24+
class GLUEDownloader:
10125

102-
with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
103-
io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
104-
header = data_fh.readline()
105-
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
106-
for idx, row in enumerate(data_fh):
107-
label, id1, id2, s1, s2 = row.strip().split('\t')
108-
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
109-
print("\tCompleted!")
26+
def __init__(self, save_path):
27+
self.save_path = save_path + '/glue'
28+
29+
def download(self, task_name):
30+
mkdir(self.save_path)
31+
if task_name in {'mrpc', 'mnli'}:
32+
task_name = task_name.upper()
33+
elif task_name == 'cola':
34+
task_name = 'CoLA'
35+
else: # SST-2
36+
assert task_name == 'sst-2'
37+
task_name = 'SST'
38+
wget.download(
39+
'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py',
40+
out=self.save_path,
41+
)
42+
sys.path.append(self.save_path)
43+
import download_glue_data
44+
download_glue_data.main(
45+
['--data_dir', self.save_path, '--tasks', task_name])
46+
sys.path.pop()

TensorFlow/LanguageModeling/BERT/data/bertPrep.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def main(args):
6262

6363
elif args.action == 'text_formatting':
6464
assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
65-
and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
66-
args.dataset != 'MNLI' and args.dataset != 'SST', 'Cannot perform text_formatting on pretrained weights'
65+
and args.dataset != 'squad' and args.dataset != 'mrpc' and args.dataset != 'cola' and \
66+
args.dataset != 'mnli' and args.dataset != 'sst-2', 'Cannot perform text_formatting on pretrained weights'
6767

6868
if not os.path.exists(directory_structure['extracted']):
6969
os.makedirs(directory_structure['extracted'])
@@ -271,10 +271,10 @@ def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
271271
'google_pretrained_weights',
272272
'nvidia_pretrained_weights',
273273
'squad',
274-
'MRPC',
275-
'CoLA',
276-
'MNLI',
277-
'SST',
274+
'mrpc',
275+
'sst-2',
276+
'mnli',
277+
'cola',
278278
'all'
279279
}
280280
)

TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ fi
2525
python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
2626
python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
2727
python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
28-
python3 /workspace/bert/data/bertPrep.py --action download --dataset MRPC
29-
python3 /workspace/bert/data/bertPrep.py --action download --dataset SST
28+
python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
29+
python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
3030

3131
# Properly format the text files
3232
if [ "$to_download" = "wiki_books" ] ; then

0 commit comments

Comments (0)