UDA Example #343
Open

jrxk wants to merge 91 commits into `asyml:master` from `jrxk:uda_example`
Commits (91)
1ca7f31 draft of back translation (haoyuLucas)
f166160 Add a backtranslation augmenter (haoyuLucas)
babda7a after merge (haoyuLucas)
4e1e866 rebase on the new base classes (haoyuLucas)
31b326e Update data_augment_processor.py (haoyuLucas)
681f15c Update data_augment_processor.py (haoyuLucas)
2142587 Update data_augment_processor.py (haoyuLucas)
d6f3a4f Delete base_augmenter.py (haoyuLucas)
653a00e Delete dictionary_replacement_augmenter.py (haoyuLucas)
b85190f Delete text_generation_augment_processor.py (haoyuLucas)
610e99a Delete dictionary_replacement_augmenter_test.py (haoyuLucas)
47d0805 Delete text_generation_augment_processor_test.py (haoyuLucas)
d7b9a5b change the configs to a Texar Config (haoyuLucas)
917090c Merge branch 'bt' of github.com:haoyuLucas/forte into bt (haoyuLucas)
a6b6169 abstract a machine translator class (haoyuLucas)
5d17c13 Update machine_translator.py (haoyuLucas)
45f0100 add the transformer to requirements (haoyuLucas)
fcd33d0 add an extra space (haoyuLucas)
5445fbe add the transformers to travis yml (haoyuLucas)
07d7b27 add travis yml (haoyuLucas)
05947ab Merge branch 'master' into bt (haoyuLucas)
a229a58 add text classifier (jrxk)
ae0ffa5 add list (jrxk)
04e92fa fix main (jrxk)
2dc228c fix (jrxk)
1d1a577 delete some files (jrxk)
3c46e9b Merge branch 'master' into imdb_classifier (jrxk)
8c6e000 first commit of uda (haoyuLucas)
a386c62 Merge branch 'master' of https://github.com/asyml/forte into imdb_cla… (jrxk)
f8c0995 Merge branch 'master' into imdb_classifier (hunterhector)
26de46b add the bool return value (haoyuLucas)
ceff3f5 add some comments (haoyuLucas)
c14024a switch to texar-pytorch (jrxk)
2082208 Merge branch 'imdb_classifier' of https://github.com/jrxk/forte into … (jrxk)
8758d09 fix travis (jrxk)
eeaf010 Merge branch 'master' into bt (hunterhector)
7669a58 modify the setup (haoyuLucas)
7b98a5d Merge branch 'bt' of github.com:haoyuLucas/forte into bt (haoyuLucas)
e9416e7 modify travis config (haoyuLucas)
7422994 initial version of UDA iterator (haoyuLucas)
e9b4379 Merge branch 'master' into UDA (haoyuLucas)
04e7dfc fix mypy error (haoyuLucas)
7a2216c Merge branch 'UDA' of github.com:haoyuLucas/forte into UDA (haoyuLucas)
fed4d5e rerun travis (haoyuLucas)
0f6b1c4 Merge branch 'UDA' of https://github.com/haoyuLucas/forte into uda_ex… (jrxk)
608977f Merge branch 'bt' of https://github.com/haoyuLucas/forte into uda_exp… (jrxk)
a0f1470 add bt pipeline (jrxk)
26e7207 Merge branch 'auto_align_replace' of https://github.com/haoyuLucas/fo… (jrxk)
957cf08 Add toy data (jrxk)
6873fb4 Merge branch 'master' into UDA (haoyuLucas)
b452aa5 changed data for uda (jrxk)
81a484a modify train loop for UDA (jrxk)
0f8d7b0 Add doc for data augmentation (haoyuLucas)
de45a27 add TSA, minor changes (jrxk)
fe18f75 Merge branch 'UDA' of https://github.com/haoyuLucas/forte into uda_ex… (jrxk)
67308ae create imdb classifier (jrxk)
0daf443 fix travis (jrxk)
2349f0c update UDA (jrxk)
69eb738 update config, minor fixes (jrxk)
6ec645c Merge branch 'imdb_classifier_2' of https://github.com/jrxk/forte int… (jrxk)
8b4fdab add README, remove files (jrxk)
de59db0 add file link (jrxk)
9340277 remove data files (jrxk)
8aca85b remove files (jrxk)
8ffaf72 use UDA's back trans data (jrxk)
f7313dd update README (jrxk)
69009fe Merge branch 'master' of https://github.com/asyml/forte into uda_expe… (jrxk)
c4d6991 some refactor (jrxk)
f5740a8 remove imdb model (jrxk)
e565bb8 Merge branch 'master' of https://github.com/asyml/forte into uda_example (jrxk)
9a79456 refactor (jrxk)
f60961a update README (jrxk)
a95023c fix init (jrxk)
2e16182 Merge branch 'master' into uda_example (hunterhector)
b00293f Merge branch 'master' into uda_example (hunterhector)
3984038 Merge branch 'master' of https://github.com/asyml/forte into uda_example (jrxk)
efcae45 removed wget, changed imdb_format to forte pipeline (jrxk)
2800218 Update README with tutorial to UDA (jrxk)
f6a4be1 fix travis (jrxk)
c0aa83f Merge branch 'master' into uda_example (hunterhector)
f17a238 move to da folder, remove classes (jrxk)
8d8016f Merge branch 'uda_example' of https://github.com/jrxk/forte into uda_… (jrxk)
a7fb89d clean some code, update reader (jrxk)
46ccb67 more clean, adding bt (jrxk)
0fec67c update scripts, add requirements for t2t (jrxk)
34727eb add merge sentences code (jrxk)
554f892 Added instructions for back translation
259664f fix docstring, travis (jrxk)
f935d74 fix travis (jrxk)
e319fe9 update test (jrxk)
4c1b6fb Merge branch 'master' into uda_example (jrxk)
README.md
@@ -0,0 +1,64 @@
## Unsupervised Data Augmentation for Text Classification

Unsupervised Data Augmentation (UDA) is a semi-supervised learning method that achieves state-of-the-art results on a wide variety of language and vision tasks. For details, please refer to the [paper](https://arxiv.org/abs/1904.12848) and the [official repository](https://github.com/google-research/uda).

In this example, we demonstrate Forte's implementation of UDA using a simple BERT-based text classifier.
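For orientation, the UDA objective combines a supervised cross-entropy term with a consistency term that pushes the model's predictions on an unlabeled example and its augmented (back-translated) version together. The following is an illustrative pure-Python sketch of that loss; the function and variable names are made up for this README and are not Forte's actual API.

```python
import math

def kl_divergence(p, q):
    """KL(p || q) for two discrete distributions given as lists of probabilities."""
    return sum(pi * math.log(pi / qi) for pi, qi in zip(p, q) if pi > 0)

def uda_loss(sup_log_probs, sup_labels, unsup_probs, unsup_aug_probs, lam=1.0):
    """Supervised cross-entropy plus a consistency term on unlabeled pairs.

    sup_log_probs: per-class log-probabilities for the labeled examples.
    unsup_probs / unsup_aug_probs: predictions on each unlabeled example and
    on its back-translated version; the KL term pushes the two together.
    (The paper also stops gradients through the original prediction and
    applies TSA; both are omitted in this sketch.)
    """
    # Supervised term: negative log-likelihood of the gold label.
    ce = -sum(lp[y] for lp, y in zip(sup_log_probs, sup_labels)) / len(sup_labels)
    # Unsupervised term: KL between predictions on original and augmented text.
    consistency = sum(kl_divergence(p, q)
                      for p, q in zip(unsup_probs, unsup_aug_probs)) / len(unsup_probs)
    return ce + lam * consistency
```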
## Quick Start

### Install the dependencies

You need to install [texar-pytorch](https://github.com/asyml/texar-pytorch) first.

### Get the IMDB data

We use the IMDB Text Classification dataset for this example. Use the following script to download the supervised and unsupervised training data.

```bash
python download_imdb.py
```

### Preprocess and generate augmented data

You can use the following script to process the data into CSV format.

```bash
python utils/imdb_format.py --raw_data_dir=data/IMDB_raw/aclImdb --train_id_path=data/IMDB_raw/train_id_list.txt --output_dir=data/IMDB
```

The next step is to generate augmented training data (using your favorite back-translation model) and write it to a TXT file. Each line in the file should correspond to the same line in `train.csv` (without headers).
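Because UDA pairs each unlabeled example with its augmented counterpart purely by line position, it is worth sanity-checking the alignment before training. A small hypothetical helper (`check_alignment` is not part of this example's code; the file names follow the README):

```python
import csv

def check_alignment(train_csv_path, aug_txt_path):
    """Verify the augmented TXT file has one line per row of train.csv.

    A missing or extra line silently mis-pairs every subsequent example,
    so it is cheap insurance to fail loudly here.
    """
    with open(train_csv_path, newline="", encoding="utf-8") as f:
        n_rows = sum(1 for _ in csv.reader(f))  # train.csv has no header row
    with open(aug_txt_path, encoding="utf-8") as f:
        n_aug = sum(1 for _ in f)
    if n_rows != n_aug:
        raise ValueError(f"{n_rows} CSV rows vs {n_aug} augmented lines")
    return n_rows
```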
For demonstration purposes, we provide the processed and augmented [data files](https://drive.google.com/file/d/1OKrbS76mbGCIz3FcFQ8-qPpMTQkQy8bP/view?usp=sharing). Place the CSV and TXT files in the `data/IMDB` directory.

### Train

To train the baseline model without UDA:

```bash
python main.py --do-train --do-eval --do-test
```

To train with UDA:

```bash
python main.py --do-train --do-eval --do-test --use-uda
```

To change the hyperparameters, please see `config_data.py`. You can also change the number of labeled examples used for training (`num_train_data`).

#### GPU Memory Issue

According to the authors' [guidelines for hyperparameters](https://github.com/google-research/uda#general-guidelines-for-setting-hyperparameters), a longer sequence length and a larger batch size lead to better performance, but both are limited by GPU memory. By default, we use `max_seq_length=128` and `batch_size=24` to run on a GTX 1080 Ti with 11GB of memory.

## Results

With the provided data, you should be able to achieve performance similar to the following:

| Number of Labeled Examples | BERT Accuracy | BERT+UDA Accuracy |
| -------------------------- | ------------- | ----------------- |
| 24                         | 61.54         | 84.92             |
| 25000                      | 89.68         | 90.19             |

When training with only 24 labeled examples, we use the Training Signal Annealing (TSA) technique, which can be turned on by setting `tsa=True`.

You can further improve the performance by tuning hyperparameters, generating better back-translation data, using a larger BERT model, using a larger `max_seq_length`, etc.
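TSA gradually releases the supervised training signal: labeled examples whose correct-class probability already exceeds a threshold are dropped from the loss, so the model cannot quickly overfit a handful of labels. An illustrative sketch of the three schedules named in `config_data.py` follows; the function name and the scale constant 5 are assumptions taken from the UDA paper's description, not Forte's implementation.

```python
import math

def tsa_threshold(schedule, step, total_steps, num_classes):
    """Training Signal Annealing keep-threshold at a given training step.

    The threshold starts at chance level (1 / num_classes) and anneals to 1.
    """
    t = step / total_steps
    start = 1.0 / num_classes
    if schedule == "linear_schedule":
        alpha = t
    elif schedule == "exp_schedule":
        alpha = math.exp((t - 1) * 5)   # releases signal slowly, then quickly
    elif schedule == "log_schedule":
        alpha = 1 - math.exp(-t * 5)    # releases signal quickly, then slowly
    else:
        raise ValueError(f"unknown schedule: {schedule}")
    return start + alpha * (1 - start)
```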
@@ -0,0 +1,13 @@
```python
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
@@ -0,0 +1,11 @@
```python
name = "bert_classifier"
hidden_size = 768
clas_strategy = "cls_time"
dropout = 0.1
num_classes = 2

# These hyperparams are used in the bert_with_hypertuning_main.py example
hyperparams = {
    "optimizer.warmup_steps": {"start": 10000, "end": 20000, "dtype": int},
    "optimizer.static_lr": {"start": 1e-3, "end": 1e-2, "dtype": float}
}
```
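The `hyperparams` search space above pairs each optimizer setting with start/end bounds and a dtype. As a hedged illustration of how such a space can be consumed (this is not the actual logic of `bert_with_hypertuning_main.py`, and `sample_hyperparams` is a made-up name), a simple random-search sampler might look like:

```python
import random

def sample_hyperparams(space, rng=random):
    """Draw one configuration from a {name: {start, end, dtype}} search space.

    Integer-typed entries are sampled uniformly over the closed integer range;
    float-typed entries uniformly over the real interval.
    """
    config = {}
    for name, spec in space.items():
        if spec["dtype"] is int:
            config[name] = rng.randint(spec["start"], spec["end"])
        else:
            config[name] = rng.uniform(spec["start"], spec["end"])
    return config
```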
config_data.py
@@ -0,0 +1,77 @@
```python
pickle_data_dir = "data/IMDB"
unsup_bt_file = "data/IMDB/para_0.txt"
max_seq_length = 128
num_classes = 2
num_train_data = 24  # supervised data limit; max 25000

train_batch_size = 24
max_train_epoch = 3000
display_steps = 50  # Print training loss every display_steps; -1 to disable

eval_steps = 100  # Eval on the dev set every eval_steps; -1 to eval every epoch
# Proportion of training to perform linear learning rate warmup for.
# E.g., 0.1 = 10% of training.
warmup_proportion = 0.1
eval_batch_size = 8
test_batch_size = 8

feature_types = {
    # Features read from the pickled data file.
    # E.g., feature "input_ids" is read as dtype `int64`;
    # "stacked_tensor" indicates a fixed length for all data instances,
    # limited by `max_seq_length`, so examples stack into one batch tensor.
    "input_ids": ["int64", "stacked_tensor", max_seq_length],
    "input_mask": ["int64", "stacked_tensor", max_seq_length],
    "segment_ids": ["int64", "stacked_tensor", max_seq_length],
    "label_ids": ["int64", "stacked_tensor"]
}

train_hparam = {
    "allow_smaller_final_batch": False,
    "batch_size": train_batch_size,
    "dataset": {
        "data_name": "data",
        "feature_types": feature_types,
        "files": "{}/train.pkl".format(pickle_data_dir)
    },
    "shuffle": True,
    "shuffle_buffer_size": None
}

eval_hparam = {
    "allow_smaller_final_batch": True,
    "batch_size": eval_batch_size,
    "dataset": {
        "data_name": "data",
        "feature_types": feature_types,
        "files": "{}/eval.pkl".format(pickle_data_dir)
    },
    "shuffle": False
}

# UDA config
tsa = True
tsa_schedule = "linear_schedule"  # linear_schedule, exp_schedule, log_schedule

unsup_feature_types = {
    "input_ids": ["int64", "stacked_tensor", max_seq_length],
    "input_mask": ["int64", "stacked_tensor", max_seq_length],
    "segment_ids": ["int64", "stacked_tensor", max_seq_length],
    "label_ids": ["int64", "stacked_tensor"],
    "aug_input_ids": ["int64", "stacked_tensor", max_seq_length],
    "aug_input_mask": ["int64", "stacked_tensor", max_seq_length],
    "aug_segment_ids": ["int64", "stacked_tensor", max_seq_length],
    "aug_label_ids": ["int64", "stacked_tensor"]
}

unsup_hparam = {
    "allow_smaller_final_batch": True,
    "batch_size": train_batch_size,
    "dataset": {
        "data_name": "data",
        "feature_types": unsup_feature_types,
        "files": "{}/unsup.pkl".format(pickle_data_dir)
    },
    "shuffle": True,
    "shuffle_buffer_size": None,
}
```
download_imdb.py
@@ -0,0 +1,37 @@
```python
# Copyright 2020 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import subprocess


def main():
    # Create the target directory (with parents) if it does not exist yet.
    os.makedirs("data/IMDB_raw", exist_ok=True)
    # pylint: disable=line-too-long
    # Use the raw-content URL: the github.com/.../blob/... URL serves an HTML
    # page, not the text file itself.
    subprocess.run(
        'wget -P data/IMDB_raw/ https://raw.githubusercontent.com/google-research/uda/master/text/data/IMDB_raw/train_id_list.txt',
        shell=True, check=True)
    subprocess.run(
        'wget -P data/IMDB_raw/ https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        shell=True, check=True)
    subprocess.run(
        'tar xzvf data/IMDB_raw/aclImdb_v1.tar.gz -C data/IMDB_raw/ && rm data/IMDB_raw/aclImdb_v1.tar.gz',
        shell=True, check=True)


if __name__ == '__main__':
    sys.exit(main())
```