 - `augment_file_with_synonyms(file_path, augmentation_factor, probability, progress=True)`: Augment a text file by replacing words with synonyms.
 - `insert_random_word(text, word)`: Insert a random word into the input text.
 - `delete_random_word(text)`: Delete a random word from the input text.
+- `random_word_deletion(text, num_deletions=1)`: Delete a user-specified number of random words from the text.
+- `swap_random_words(text)`: Swap two random words in the text.
 - `insert_synonym(text, word)`: Insert a synonym of the given word into the input text.
 - `paraphrase(text)`: Paraphrase the input text.
 - `flip_horizontal(image)`: Flip the input image horizontally.
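The two new helpers announced in this list, `random_word_deletion` and `swap_random_words`, are plain string-to-string functions, so they are easy to smoke-test in isolation. A minimal sketch, assuming the module is importable; the name `text_augmentation` below is a placeholder, since the diff does not show the file's module path:

```python
import random

# Placeholder import path -- the diff does not name the module.
from text_augmentation import random_word_deletion, swap_random_words

random.seed(0)  # fix the RNG so the demo is reproducible

sample = "the quick brown fox jumps over the lazy dog"
print(random_word_deletion(sample, num_deletions=2))  # drops two random words
print(swap_random_words(sample))                      # exchanges two word positions
```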
 import tqdm
 from PIL import ImageEnhance
 
-nltk.download("wordnet", quiet=True)
-nltk.download("averaged_perceptron_tagger", quiet=True)
-nltk.download("punkt", quiet=True)
-
 def replace_word_with_synonym(word):
     """
     Replace the given word with a synonym.
@@ -49,6 +47,7 @@ def replace_word_with_synonym(word):
     - `str`: The synonym for the word.
     """
     try:
+        nltk.download("wordnet", quiet=True)
         synonyms = []
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
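Note the pattern introduced here and repeated in the hunks below: the `nltk.download(...)` calls move from import time into each function body, so importing the module no longer touches the network. The price is a quiet, disk-cached download check on every call. If that overhead matters, one possible refinement is a process-level memo; a sketch under that assumption, not part of this diff:

```python
import nltk

_ensured: set = set()  # NLTK resources already checked in this process

def _ensure_resource(name: str) -> None:
    """Run nltk.download(name) at most once per process; NLTK caches on disk."""
    if name not in _ensured:
        nltk.download(name, quiet=True)
        _ensured.add(name)

# e.g. _ensure_resource("wordnet") at the top of replace_word_with_synonym
```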
@@ -82,37 +81,20 @@ def augment_text_with_synonyms(text, augmentation_factor, probability, progress=
             raise ValueError("Probability value cannot be of NoneType. Choose a float from 0 to 1")
 
         tokens = text.split()
-        num_tokens = len(tokens)
-        processed_tokens = 0
-
-        start_time = time.time()
-
-        for _ in range(augmentation_factor):
-            augmented_tokens = []
 
-            for token in tokens:
-                if random.random() < probability:
-                    replaced_token = replace_word_with_synonym(token)
-                    augmented_tokens.append(replaced_token)
-                else:
-                    augmented_tokens.append(token)
+        with tqdm.tqdm(total=augmentation_factor * len(tokens), desc="Augmenting Text", disable=not progress) as pbar:
+            for _ in range(augmentation_factor):
+                augmented_tokens = []
 
-                processed_tokens += 1
+                for token in tokens:
+                    if random.random() < probability:
+                        replaced_token = replace_word_with_synonym(token)
+                        augmented_tokens.append(replaced_token)
+                    else:
+                        augmented_tokens.append(token)
+                    pbar.update(1)
 
-                # Print progress
-                if progress:
-                    elapsed_time = time.time() - start_time
-                    if elapsed_time == 0:
-                        elapsed_time = 1e-6  # Set a small value to avoid division by zero
-                    tokens_per_sec = processed_tokens / elapsed_time
-                    print(f"Progress: {processed_tokens}/{num_tokens} tokens | {tokens_per_sec:.2f} tokens/sec", end="\r")
-
-            augmented_text.append(' '.join(augmented_tokens))
-
-        # Print completion message
-        if progress:
-            print(" " * 100, end="\r")  # Clear progress line
-            print("Augmentation complete.")
+                augmented_text.append(' '.join(augmented_tokens))
 
     except Exception as e:
         print(f"An error occurred during text augmentation: {str(e)}")
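With this refactor the manual tokens/sec arithmetic disappears: the bar's `total` is `augmentation_factor * len(tokens)` and `pbar.update(1)` fires once per token per pass, so the bar reaches 100% exactly when the last copy is built, while `disable=not progress` preserves the old `progress` flag. An illustrative call, assuming the function returns the `augmented_text` list it accumulates (the `return` sits outside this hunk):

```python
variants = augment_text_with_synonyms(
    "the quick brown fox jumps over the lazy dog",
    augmentation_factor=3,  # build three augmented copies of the input
    probability=0.3,        # each token has a 30% chance of synonym replacement
    progress=True,          # show the tqdm bar; False silences it entirely
)
```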
@@ -175,6 +157,7 @@ def insert_random_word(text, word):
     - `str`: The text with the randomly inserted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
         words.insert(random.randint(0, len(words)), word)
         modified_text = " ".join(words)
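A detail worth noticing in the unchanged insertion line above: `random.randint(0, len(words))` is inclusive at both ends, so the new word can land before the first token or after the last one, i.e. every gap between tokens is a valid insertion point:

```python
import random

words = ["a", "b", "c"]
# randint's upper bound is inclusive, so all four gaps are reachable:
# X a b c | a X b c | a b X c | a b c X
words.insert(random.randint(0, len(words)), "X")
print(" ".join(words))
```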
@@ -184,7 +167,7 @@ def insert_random_word(text, word):
         return text
 
 
-def delete_random_word(text):
+def random_word_deletion(text, num_deletions=1):
     """
     Delete a random word from the input text.
 
@@ -193,20 +176,62 @@ def delete_random_word(text):
 
     Parameters:
     - `text` (str): The input text for word deletion.
+    - `num_deletions` (int): The number of words to delete.
 
     Returns:
     - `str`: The text with a randomly deleted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
-        if len(words) > 1:
-            words.pop(random.randint(0, len(words) - 1))
+        for _ in range(num_deletions):
+            if len(words) > 1:
+                words.pop(random.randint(0, len(words) - 1))
         modified_text = " ".join(words)
         return modified_text
     except Exception as e:
         print(f"An error occurred during word deletion: {str(e)}")
         return text
 
+def delete_random_word(text):
+    """
+    Delete a random word from the input text.
+
+    This function randomly deletes a word from the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word deletion.
+
+    Returns:
+    - `str`: The text with a randomly deleted word.
+    """
+    return random_word_deletion(text, num_deletions=1)
+
+def swap_random_words(text):
+    """
+    Swap two random words in the input text.
+
+    This function randomly swaps two words in the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word swapping.
+
+    Returns:
+    - `str`: The text with two words swapped.
+    """
+    try:
+        nltk.download("punkt", quiet=True)
+        words = nltk.word_tokenize(text)
+        if len(words) > 1:
+            idx1, idx2 = random.sample(range(len(words)), 2)
+            words[idx1], words[idx2] = words[idx2], words[idx1]
+        modified_text = " ".join(words)
+        return modified_text
+    except Exception as e:
+        print(f"An error occurred during word swapping: {str(e)}")
+        return text
 
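The `len(words) > 1` guard now sits inside the deletion loop, so asking for more deletions than the text has words degrades gracefully: the loop stops removing once a single token remains instead of raising on an empty list. A quick check, assuming the functions above are in scope:

```python
out = random_word_deletion("alpha beta gamma", num_deletions=10)
print(out)                     # a single surviving word
assert len(out.split()) == 1   # the guard never empties the list
```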
 def insert_synonym(text, word):
     """
@@ -245,6 +270,8 @@ def paraphrase(text):
     - `str`: The paraphrased text.
     """
     try:
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)
         tokens = nltk.word_tokenize(text)
         tagged_tokens = nltk.pos_tag(tokens)
         paraphrased_tokens = [replace_word_with_synonym(token) if tag.startswith(("VB", "NN", "JJ")) else token for token, tag in tagged_tokens]
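So `paraphrase` is really POS-gated synonym substitution: only tokens whose Penn Treebank tag starts with `VB` (verbs), `NN` (nouns), or `JJ` (adjectives) are sent through `replace_word_with_synonym`; everything else passes through unchanged. The gating can be inspected on its own; a standalone sketch using only the NLTK calls shown in the diff:

```python
import nltk

nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)

tokens = nltk.word_tokenize("The happy dog runs quickly")
for token, tag in nltk.pos_tag(tokens):
    # Content words (VB*/NN*/JJ*) are replacement candidates; the rest are kept.
    action = "replace" if tag.startswith(("VB", "NN", "JJ")) else "keep"
    print(f"{token:10} {tag:4} {action}")
```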