Skip to content

Commit 62a86ce

Browse files
authored
Merge pull request #23 from RTIInternational/backtranslation
Add a data augmentation method based on backtranslation along with a benchmark.
2 parents 981a614 + 78aa36b commit 62a86ce

File tree

13 files changed

+573
-9
lines changed

13 files changed

+573
-9
lines changed

benchmark/BENCHMARK_SPECS.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,13 @@
644644
params: {}
645645
- augment_name: "BERTMaskedLM"
646646
params: {}
647+
- augment_name: "MarianMT"
648+
params:
649+
# Need as many languages as the largest multiplier used above
650+
# Top 10 available in MarianMT by descending popularity on Wikipedia as a
651+
# rough proxy for best-supported languages
652+
# https://en.wikipedia.org/wiki/List_of_Wikipedias
653+
target_languages: ["french", "german", "japanese", "russian", "italian", "portugese", "dutch", "indonesian", "ukrainian", "swedish"]
647654

648655
- scenario: "document_windowing"
649656
params:
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Results: MarianMT
2+
| | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy |
3+
|---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:|
4+
| 0 | 0.005 | 0 | 0.623019 | 0.651554 | 0.63396 | 0.63396 |
5+
| 1 | 0.005 | 1 | 0.333422 | 0.75001 | 0.50004 | 0.50004 |
6+
| 2 | 0.005 | 5 | 0.658066 | 0.658107 | 0.65808 | 0.65808 |
7+
| 3 | 0.005 | 10 | 0.646764 | 0.67772 | 0.65704 | 0.65704 |
8+
| 4 | 0.05 | 0 | 0.798386 | 0.798483 | 0.7984 | 0.7984 |
9+
| 5 | 0.05 | 1 | 0.794299 | 0.794979 | 0.7944 | 0.7944 |
10+
| 6 | 0.05 | 5 | 0.808468 | 0.808556 | 0.80848 | 0.80848 |
11+
| 7 | 0.05 | 10 | 0.807779 | 0.807932 | 0.8078 | 0.8078 |
12+
| 8 | 0.33 | 0 | 0.85779 | 0.857904 | 0.8578 | 0.8578 |
13+
| 9 | 0.33 | 1 | 0.853155 | 0.85363 | 0.8532 | 0.8532 |
14+
| 10 | 0.33 | 5 | 0.856863 | 0.857054 | 0.85688 | 0.85688 |
15+
| 11 | 0.75 | 0 | 0.876512 | 0.876623 | 0.87652 | 0.87652 |
16+
| 12 | 0.75 | 1 | 0.871752 | 0.872361 | 0.8718 | 0.8718 |
17+
| 13 | 0.75 | 5 | 0.871178 | 0.871456 | 0.8712 | 0.8712 |
18+
19+
![Results](MarianMT/plot.png)
20+
---
47.5 KB
Loading
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "MarianMT", "params": {"target_languages": ["french", "german", "japanese", "russian", "italian", "portugese", "dutch", "indonesian", "ukrainian", "swedish"]}}

benchmark/benchmark_output/data_augmentation/data_augmentation.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,23 @@
5858

5959
![Results](BERTMaskedLM/plot.png)
6060
---
61+
# Results: MarianMT
62+
| | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy |
63+
|---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:|
64+
| 0 | 0.005 | 0 | 0.623019 | 0.651554 | 0.63396 | 0.63396 |
65+
| 1 | 0.005 | 1 | 0.333422 | 0.75001 | 0.50004 | 0.50004 |
66+
| 2 | 0.005 | 5 | 0.658066 | 0.658107 | 0.65808 | 0.65808 |
67+
| 3 | 0.005 | 10 | 0.646764 | 0.67772 | 0.65704 | 0.65704 |
68+
| 4 | 0.05 | 0 | 0.798386 | 0.798483 | 0.7984 | 0.7984 |
69+
| 5 | 0.05 | 1 | 0.794299 | 0.794979 | 0.7944 | 0.7944 |
70+
| 6 | 0.05 | 5 | 0.808468 | 0.808556 | 0.80848 | 0.80848 |
71+
| 7 | 0.05 | 10 | 0.807779 | 0.807932 | 0.8078 | 0.8078 |
72+
| 8 | 0.33 | 0 | 0.85779 | 0.857904 | 0.8578 | 0.8578 |
73+
| 9 | 0.33 | 1 | 0.853155 | 0.85363 | 0.8532 | 0.8532 |
74+
| 10 | 0.33 | 5 | 0.856863 | 0.857054 | 0.85688 | 0.85688 |
75+
| 11 | 0.75 | 0 | 0.876512 | 0.876623 | 0.87652 | 0.87652 |
76+
| 12 | 0.75 | 1 | 0.871752 | 0.872361 | 0.8718 | 0.8718 |
77+
| 13 | 0.75 | 5 | 0.871178 | 0.871456 | 0.8712 | 0.8712 |
78+
79+
![Results](MarianMT/plot.png)
80+
---

benchmark/run_benchmarks.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
2+
import os
23
from pathlib import Path
34
from typing import Any, Dict, List
45

56
import click
67
import yaml
78

89
import gobbli
10+
from benchmark_util import BENCHMARK_DATA_DIR
911
from scenario import (
1012
ClassImbalanceScenario,
1113
DataAugmentationScenario,
@@ -104,6 +106,10 @@ def run(
104106
debug: bool,
105107
raise_exceptions: bool,
106108
):
109+
# Make sure all models run outside of experiments create their data under the
110+
# assigned benchmark directory
111+
os.environ["GOBBLI_DIR"] = str(BENCHMARK_DATA_DIR)
112+
107113
logging.basicConfig(
108114
level=log_level, format="[%(asctime)s] %(levelname)s - %(name)s: %(message)s"
109115
)

benchmark/scenario.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import gobbli.model
2121
from benchmark_util import (
22+
BENCHMARK_DATA_DIR,
2223
PREPROCESS_FUNCS,
2324
StdoutCatcher,
2425
assert_param_required,
@@ -41,11 +42,27 @@
4142
make_document_windows,
4243
pool_document_windows,
4344
)
45+
from gobbli.model.base import BaseModel
4446
from gobbli.util import TokenizeMethod, assert_in, assert_type, pred_prob_to_pred_label
4547

4648
LOGGER = logging.getLogger(__name__)
4749

4850

51+
def get_model_run_params() -> Dict[str, Any]:
52+
"""
53+
See also :func:`run_benchmark_experiment`, since there's some duplication between there
54+
and here (that function initializes its own models indirectly via
55+
:class:`gobbli.experiment.ClassificationExperiment`
56+
57+
Returns:
58+
Parameters that should be passed to any gobbli model used as part of benchmarks.
59+
"""
60+
return {
61+
"use_gpu": os.getenv("GOBBLI_USE_GPU") is not None,
62+
"nvidia_visible_devices": os.getenv("NVIDIA_VISIBLE_DEVICES", ""),
63+
}
64+
65+
4966
class BaseRun(ABC):
5067
"""
5168
Base class for a single run within a benchmark scenario.
@@ -426,13 +443,10 @@ def _do_run(self, run: ModelEmbeddingRun, run_output_dir: Path) -> str:
426443
stdout_catcher = StdoutCatcher()
427444
with stdout_catcher:
428445
# Construct the dict of kwargs up-front so each run can override the "use_gpu"
429-
# option if necessary -- ex. for models like spaCy which have trouble controlling
430-
# memory usage on the GPU and don't gain much benefit from it
431-
model_kwargs = {
432-
"use_gpu": os.getenv("GOBBLI_USE_GPU") is not None,
433-
"nvidia_visible_devices": os.getenv("NVIDIA_VISIBLE_DEVICES", ""),
434-
**run.model_params,
435-
}
446+
# option if necessary using its model params -- ex. for models like spaCy
447+
# which have trouble controlling memory usage on the GPU and don't gain
448+
# much benefit from it
449+
model_kwargs = {**get_model_run_params(), **run.model_params}
436450
model = model_cls(**model_kwargs)
437451
model.build()
438452

@@ -778,7 +792,20 @@ def _do_run(self, run: AugmentRun, run_output_dir: Path) -> str:
778792

779793
assert_valid_augment(run.augment_name)
780794
augment_cls = getattr(gobbli.augment, run.augment_name)
781-
augment_obj = augment_cls(**run.params)
795+
796+
model_run_params: Dict[str, Any] = {}
797+
if issubclass(augment_cls, BaseModel):
798+
# If the augment method is also a gobbli model (and will be mounting files back-
799+
# and-forth with Docker), we need to make sure it has the proper params
800+
# applied ex. to store data in the correct place and use GPU(s)
801+
model_run_params = get_model_run_params()
802+
803+
augment_obj = augment_cls(**run.params, **model_run_params)
804+
805+
# Some augmentation methods are also models, which need to be built
806+
# beforehand
807+
if isinstance(augment_obj, BaseModel):
808+
augment_obj.build()
782809

783810
all_results = []
784811

gobbli/augment/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from gobbli.augment.bert import BERTMaskedLM
2+
from gobbli.augment.marian import MarianMT
23
from gobbli.augment.word2vec import Word2Vec
34
from gobbli.augment.wordnet import WordNet
45

5-
__all__ = ["BERTMaskedLM", "Word2Vec", "WordNet"]
6+
__all__ = ["BERTMaskedLM", "Word2Vec", "WordNet", "MarianMT"]

gobbli/augment/marian/Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime
2+
3+
RUN pip install transformers==2.9.1
4+
5+
COPY ./src /code/marian
6+
WORKDIR /code/marian

gobbli/augment/marian/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .model import MarianMT
2+
3+
__all__ = ["MarianMT"]

0 commit comments

Comments (0)