
Commit 58eb2be

add sentence & character level data augmentation api (#4194)
* add_sentence_dataaug
* add_char_augmentation
* add_antonym
* add_file_augmentation
* add_test
1 parent 20adadc commit 58eb2be
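
In short, this commit extends the existing word-level data augmentation API with character- and sentence-level strategies, antonym-based substitution, and a file-level entry point. A hedged usage sketch (the class name `CharSubstitute` and the `"antonym"` strategy string are assumptions inferred from the commit message and the `char_antonym.json` data file added below, not confirmed signatures):

```python
# Assumed usage of the new char-level API; CharSubstitute and "antonym"
# are inferred from the commit message and char_antonym.json below.
from paddlenlp.dataaug import CharSubstitute

aug = CharSubstitute("antonym", create_n=2)
# After this commit, a single string input returns a nested list:
# one inner list holding the create_n augmented variants.
print(aug.augment("今天天气很好"))
```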

15 files changed: +3067 −974 lines

docs/dataaug.md

Lines changed: 907 additions & 273 deletions
(Large diff not rendered.)

paddlenlp/dataaug/__init__.py

Lines changed: 5 additions & 5 deletions

@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .word_substitute import *
-from .word_insert import *
-from .word_delete import *
-from .word_swap import *
+from .base_augment import FileAugment
+from .char import *
+from .sentence import *
+from .word import *
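
The package now exposes char- and sentence-level modules alongside a consolidated `word` module (replacing `word_substitute`, `word_insert`, `word_delete`, and `word_swap`), plus the new `FileAugment` helper. A minimal import sketch, assuming the previous word-level class names carry over into the consolidated module:

```python
# FileAugment is added by this commit; WordSubstitute is assumed to be
# re-exported from the new consolidated .word module.
from paddlenlp.dataaug import FileAugment, WordSubstitute
```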

paddlenlp/dataaug/base_augment.py

Lines changed: 87 additions & 9 deletions

@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 import os
 import re
-import math
-import random
 from typing import Iterable
 
-import numpy as np
-import paddle
 from paddle.dataset.common import md5file
 from paddle.utils.download import get_path_from_url
 
+from ..data import JiebaTokenizer, Vocab
 from ..utils.env import DATA_HOME
-from ..data import Vocab, JiebaTokenizer
 
 
 class BaseAugment(object):
@@ -44,7 +41,7 @@ class BaseAugment(object):
             Maximum number of augmented words in sequences.
     """
 
-    def __init__(self, create_n, aug_n=None, aug_percent=0.02, aug_min=1, aug_max=10):
+    def __init__(self, create_n=1, aug_n=None, aug_percent=0.1, aug_min=1, aug_max=10, vocab="vocab"):
         self._DATA = {
             "stop_words": (
                 "stopwords.txt",
@@ -56,24 +53,49 @@ def __init__(self, create_n, aug_n=None, aug_percent=0.02, aug_min=1, aug_max=10
                 "25c2d41aec5a6d328a65c1995d4e4c2e",
                 "https://bj.bcebos.com/paddlenlp/data/baidu_encyclopedia_w2v_vocab.json",
             ),
+            "test_vocab": (
+                "test_vocab.json",
+                "1d2fce1c80a4a0ec2e90a136f339ab88",
+                "https://bj.bcebos.com/paddlenlp/data/test_vocab.json",
+            ),
             "word_synonym": (
                 "word_synonym.json",
                 "aaa9f864b4af4123bce4bf138a5bfa0d",
                 "https://bj.bcebos.com/paddlenlp/data/word_synonym.json",
             ),
+            "word_embedding": (
+                "word_embedding.json",
+                "534aa4ad274def4deff585cefd8ead32",
+                "https://bj.bcebos.com/paddlenlp/data/word_embedding.json",
+            ),
             "word_homonym": (
                 "word_homonym.json",
                 "a578c04201a697e738f6a1ad555787d5",
                 "https://bj.bcebos.com/paddlenlp/data/word_homonym.json",
            ),
+            "char_homonym": (
+                "char_homonym.json",
+                "dd98d5d5d32a3d3dd45c8f7ca503c7df",
+                "https://bj.bcebos.com/paddlenlp/data/char_homonym.json",
+            ),
+            "char_antonym": (
+                "char_antonym.json",
+                "f892f5dce06f17d19949ebcbe0ed52b7",
+                "https://bj.bcebos.com/paddlenlp/data/char_antonym.json",
+            ),
+            "word_antonym": (
+                "word_antonym.json",
+                "cbea11fa99fbe9d07e8185750b37e84a",
+                "https://bj.bcebos.com/paddlenlp/data/word_antonym.json",
+            ),
         }
         self.stop_words = self._get_data("stop_words")
         self.aug_n = aug_n
         self.aug_percent = aug_percent
         self.aug_min = aug_min
         self.aug_max = aug_max
         self.create_n = create_n
-        self.vocab = Vocab.from_json(self._load_file("vocab"))
+        self.vocab = Vocab.from_json(self._load_file(vocab))
         self.tokenizer = JiebaTokenizer(self.vocab)
         self.loop = 5
 
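`BaseAugment.__init__` now provides defaults for every argument (`create_n=1`; `aug_percent` raised from 0.02 to 0.1) and takes a `vocab` key so callers can select an alternative vocabulary such as the new `test_vocab` resource. A minimal sketch of a subclass picking up these defaults (the subclass itself is hypothetical; only `_augment` must be overridden, per the diff below):

```python
# Hypothetical subclass showing the new defaults: one augmented sequence
# per input (create_n=1), roughly 10% of tokens touched (aug_percent=0.1),
# bounded by aug_min=1 and aug_max=10, using the default "vocab" resource.
from paddlenlp.dataaug.base_augment import BaseAugment


class ReverseAugment(BaseAugment):
    """Toy strategy for illustration: reverse each sequence."""

    def _augment(self, sequence):
        return [sequence[::-1] for _ in range(self.create_n)]


aug = ReverseAugment()            # all constructor arguments now optional
print(aug.augment("千山鸟飞绝"))   # -> [["绝飞鸟山千"]]
```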
@@ -150,7 +172,7 @@ def augment(self, sequences, num_thread=1):
         # Single Thread
         if num_thread == 1:
             if isinstance(sequences, str):
-                return self._augment(sequences)
+                return [self._augment(sequences)]
             else:
                 output = []
                 for sequence in sequences:
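
This one-line change normalizes the return shape: a single string input previously returned a flat list of variants, while a list input returned one list per sequence; now both come back as a list of lists. A before/after sketch (the `WordSubstitute` name is assumed from the pre-existing word-level API):

```python
from paddlenlp.dataaug import WordSubstitute  # class name assumed

aug = WordSubstitute("synonym", create_n=2)

# Before: aug.augment("一句话")             -> ["变体一", "变体二"]
# After:  aug.augment("一句话")             -> [["变体一", "变体二"]]
# Either way: aug.augment(["句一", "句二"])  -> [[...], [...]]
```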
@@ -161,3 +183,59 @@ def augment(self, sequences, num_thread=1):
 
     def _augment(self, sequence):
         raise NotImplementedError
+
+
+class FileAugment(object):
+    """
+    File data augmentation
+
+    Args:
+        strategies (List):
+            List of augmentation strategies.
+    """
+
+    def __init__(self, strategies):
+        self.strategies = strategies
+
+    def augment(self, input_file, output_file="aug.txt", separator=None, separator_id=0):
+        output_sequences = []
+        sequences = []
+
+        input_sequences = self.file_read(input_file)
+
+        if separator:
+            for input_sequence in input_sequences:
+                sequences.append(input_sequence.split(separator)[separator_id])
+        else:
+            sequences = input_sequences
+
+        for strategy in self.strategies:
+            aug_sequences = strategy.augment(sequences)
+            if separator:
+                for aug_sequence, input_sequence in zip(aug_sequences, input_sequences):
+                    input_items = input_sequence.split(separator)
+                    for s in aug_sequence:
+                        input_items[separator_id] = s
+                        output_sequences.append(separator.join(input_items))
+            else:
+                for aug_sequence in aug_sequences:
+                    output_sequences += aug_sequence
+
+        if output_file:
+            self.file_write(output_sequences, output_file)
+
+        return output_sequences
+
+    def file_read(self, input_file):
+        input_sequences = []
+        with open(input_file, "r", encoding="utf-8") as f:
+            for line in f:
+                input_sequences.append(line.strip())
+            f.close()
+        return input_sequences
+
+    def file_write(self, output_sequences, output_file):
+        with open(output_file, "w", encoding="utf-8") as f:
+            for output_sequence in output_sequences:
+                f.write(output_sequence + "\n")
+            f.close()
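
`FileAugment` reads one example per line, optionally splits each line on `separator` so that only the field at `separator_id` is augmented (the other fields, e.g. labels, are carried over unchanged), and writes the reassembled lines to `output_file`. A usage sketch; the file name and the word-level strategy are assumptions:

```python
# Hypothetical end-to-end run over a tab-separated "text<TAB>label" file,
# augmenting only column 0 and leaving the label column untouched.
from paddlenlp.dataaug import FileAugment, WordSubstitute  # strategy name assumed

file_aug = FileAugment([WordSubstitute("synonym", create_n=1)])
aug_lines = file_aug.augment("train.txt", output_file="aug.txt",
                             separator="\t", separator_id=0)
```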
