-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheasy_dat_aug.py
More file actions
122 lines (92 loc) · 3.63 KB
/
easy_dat_aug.py
File metadata and controls
122 lines (92 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
##################################
# Original Code at: https://github.com/catSirup/KorEDA
# Removed the augmentation techniques that are unnecessary for our project
##################################
import random
# a word in the list below must be included at the result texts
_important_keywords = ['웃', '배꼽', '재치'
'감동', '눈물',
'스토리', '내용',
'몰입', '빠져', '집중',
'무대', '효과',
'노래', '넘버',
'춤', '댄스', '안무'
'연기', '호연', '열연']
########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
    """Randomly drop words from a tokenized sentence.

    Every word is removed with probability ``p``, except words that match an
    entry of ``_important_keywords`` (the word is a substring of the keyword
    or the keyword is a substring of the word); those are always kept.

    Args:
        words: List of word strings.
        p: Deletion probability applied to each unprotected word.

    Returns:
        ``words`` itself when it holds at most one element. Otherwise a new
        list with the surviving words in their original order; if every word
        was deleted, a list holding one randomly chosen word instead, so the
        result is never empty for non-empty input.
    """
    # Nothing can be deleted from an empty or one-word sentence. Returning
    # early also keeps the fallback below from calling the RNG on an empty
    # sequence, which would raise.
    if len(words) <= 1:
        return words

    new_words = []
    for word in words:
        # Drawn for every word (protected or not) so that seeded runs consume
        # the random stream the same way regardless of keyword hits.
        r = random.uniform(0, 1)
        is_important = any(
            word in keyword or keyword in word
            for keyword in _important_keywords
        )
        if is_important or r > p:
            new_words.append(word)

    # Everything was deleted: fall back to a single random word.
    if not new_words:
        return [random.choice(words)]
    return new_words
########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` random pairwise swaps.

    The input list is left untouched; all swapping happens on a shallow copy
    that is passed through ``swap_word`` once per requested swap.

    Args:
        words: List of word strings.
        n: Number of swap operations to apply.

    Returns:
        A new list containing the same words, possibly reordered.
    """
    shuffled = words.copy()
    for _swap_round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled
def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    Args:
        new_words: List of word strings; it is mutated by the swap.

    Returns:
        The same list object that was passed in. It is returned unchanged
        when it has fewer than two elements, or when no distinct second
        index was found within the retry budget.
    """
    # A swap needs two distinct positions. Bailing out here also avoids
    # random.randint(0, -1) raising ValueError on an empty list.
    if len(new_words) < 2:
        return new_words

    last_index = len(new_words) - 1
    random_idx_1 = random.randint(0, last_index)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last_index)
        counter += 1
        # Give up once a fourth draw has been made (the list is returned
        # as-is even if that fourth draw happened to differ).
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = (
        new_words[random_idx_2],
        new_words[random_idx_1],
    )
    return new_words
def EDA(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.2, p_rd=0.2, num_aug=6):
    """Generate augmented variants of ``sentence`` by random swap and deletion.

    The sentence is split on single spaces (empty tokens are discarded), then
    augmented with ``random_swap`` and ``random_deletion``. Distinct results
    are collected, shuffled and trimmed, and the original sentence is appended
    if it is not already among them.

    Args:
        sentence: Input text.
        alpha_sr: Unused by this function; kept so existing callers that pass
            it keep working.
        alpha_ri: Unused by this function; kept so existing callers that pass
            it keep working.
        alpha_rs: Fraction of the word count used as the number of swaps per
            random-swap variant (at least one swap is always performed).
        p_rd: Per-word deletion probability for random deletion.
        num_aug: When >= 1, the maximum number of augmented sentences kept.
            When < 1, each candidate is instead kept with probability
            ``num_aug / number_of_candidates``.

    Returns:
        List of augmented sentences (words joined by single spaces) plus the
        original ``sentence``. For a sentence without any words the result is
        just ``[sentence]``.
    """
    words = [word for word in sentence.split(' ') if word != ""]

    # Nothing to augment in an empty / whitespace-only sentence; the helpers
    # cannot pick random positions in an empty list.
    if not words:
        return [sentence]

    num_words = len(words)

    # Only two techniques (swap, deletion) are applied here, so the requested
    # amount is split between two of them; "+ 1" over-generates slightly to
    # compensate for duplicates that get discarded below.
    num_techniques = 2
    num_new_per_technique = int(num_aug / num_techniques) + 1
    n_rs = max(1, int(alpha_rs * num_words))

    # Order matters for seeded runs: all swap variants first, then deletions.
    techniques = (
        lambda: random_swap(words, n_rs),        # rs
        lambda: random_deletion(words, p_rd),    # rd
    )

    augmented_sentences = []
    for augment in techniques:
        for _ in range(num_new_per_technique):
            a_words = augment()
            if not a_words:
                continue
            to_append = " ".join(a_words)
            if to_append not in augmented_sentences:
                augmented_sentences.append(to_append)

    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [
            s for s in augmented_sentences if random.uniform(0, 1) < keep_prob
        ]

    # Always hand the untouched original back alongside its variants.
    if sentence not in augmented_sentences:
        augmented_sentences.append(sentence)
    return augmented_sentences