
Commit 6ab152f

Merge pull request #1 from Infinitode/duplipy-improvements
Duplipy improvements, `v0.2.5`.
2 parents c544964 + 04fec13 · commit 6ab152f

File tree

9 files changed: +590 −270 lines


duplipy/__init__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,5 +1,5 @@
 import duplipy
 from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag, remove_profanity_from_text, remove_sensitive_info_from_text, remove_hate_speech_from_text, post_format_text
-from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay
-from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score
-from .text_analysis import analyze_sentiment
+from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, random_word_deletion, swap_random_words, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay
+from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score, sorensen_dice_coefficient, cosine_similarity_score, mean_squared_error, psnr
+from .text_analysis import analyze_sentiment, named_entity_recognition
```
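The rewritten import lines surface the new v0.2.5 API at the package root. A quick smoke test of a few newly exported names (illustrative, assuming duplipy v0.2.5 is installed; outputs vary because the augmentations are random):

```python
# Illustrative check of the new top-level exports in v0.2.5.
from duplipy import (
    random_word_deletion,       # new in replication
    swap_random_words,          # new in replication
    sorensen_dice_coefficient,  # new in similarity
)

text = "The quick brown fox jumps over the lazy dog"
print(random_word_deletion(text, num_deletions=2))  # two random words removed
print(swap_random_words(text))                      # two random words swapped
print(sorensen_dice_coefficient(text, "The quick brown fox"))
```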

duplipy/formatting.py

Lines changed: 8 additions & 5 deletions
```diff
@@ -27,11 +27,6 @@
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 
-nltk.download('stopwords', quiet=True)
-nltk.download('punkt', quiet=True)
-nltk.download('wordnet', quiet=True)
-nltk.download('averaged_perceptron_tagger', quiet=True)
-
 def remove_stopwords(text):
     """
     Remove stopwords from the input text using NLTK's stopwords.
@@ -46,6 +41,7 @@ def remove_stopwords(text):
     - `str`: The text without stopwords.
     """
     try:
+        nltk.download('stopwords', quiet=True)
         stop_words = set(stopwords.words('english'))
         tokens = text.split()
         filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
@@ -192,6 +188,7 @@ def tokenize_text(text):
     Returns:
     - `list`: A list of tokens (words) from the input text.
     """
+    nltk.download('punkt', quiet=True)
     tokens = word_tokenize(text)
     return tokens
 
@@ -225,6 +222,7 @@ def lemmatize_words(words):
     Returns:
     - `list`: A list of lemmatized words.
     """
+    nltk.download('wordnet', quiet=True)
     lemmatizer = WordNetLemmatizer()
     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
     return lemmatized_words
@@ -243,6 +241,8 @@ def pos_tag(text):
     - `list`: A list of tuples containing (word, tag) pairs.
     """
     try:
+        nltk.download('punkt', quiet=True)
+        nltk.download('averaged_perceptron_tagger', quiet=True)
         tokens = nltk.word_tokenize(text)
         tagged_words = nltk.pos_tag(tokens)
         return tagged_words
@@ -262,6 +262,7 @@ def remove_profanity_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = remove_profanity(sentences, language='All')
     cleaned_text = ' '.join(cleaned_sentences)
@@ -280,6 +281,7 @@ def remove_sensitive_info_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = remove_sensitive_information(sentences)
     cleaned_text = ' '.join(cleaned_sentences)
@@ -298,6 +300,7 @@ def remove_hate_speech_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = []
     for sentence in sentences:
```
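Every hunk in this file applies the same fix: the four module-level `nltk.download(...)` calls are deleted and re-inserted inside the functions that actually need each resource, so `import duplipy` no longer touches the network and downloads happen lazily on first use (`nltk.download` returns immediately when the data is already cached). A minimal sketch of the pattern (illustrative, not duplipy's exact code):

```python
# Sketch of the lazy-download pattern adopted in formatting.py (illustrative).
import nltk

def tokenize_text(text):
    # Fetch the 'punkt' tokenizer model only when tokenization is requested;
    # this is a no-op if the resource is already present locally.
    nltk.download('punkt', quiet=True)
    return nltk.word_tokenize(text)

print(tokenize_text("DupliPy now downloads NLTK data on first use."))
```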

duplipy/replication.py

Lines changed: 62 additions & 35 deletions
```diff
@@ -8,6 +8,8 @@
 - `augment_file_with_synonyms(file_path, augmentation_factor, probability, progress=True)`: Augment a text file by replacing words with synonyms.
 - `insert_random_word(text, word)`: Insert a random word into the input text.
 - `delete_random_word(text)`: Delete a random word from the input text.
+- `random_word_deletion(text, num_deletions=1)`: Deletes a user-specified number of random words from the text.
+- `swap_random_words(text)`: Swaps two random words in the text.
 - `insert_synonym(text, word)`: Insert a synonym of the given word into the input text.
 - `paraphrase(text)`: Paraphrase the input text.
 - `flip_horizontal(image)`: Flip the input image horizontally.
@@ -31,10 +33,6 @@
 import tqdm
 from PIL import ImageEnhance
 
-nltk.download("wordnet", quiet=True)
-nltk.download("averaged_perceptron_tagger", quiet=True)
-nltk.download("punkt", quiet=True)
-
 def replace_word_with_synonym(word):
     """
     Replace the given word with a synonym.
@@ -49,6 +47,7 @@ def replace_word_with_synonym(word):
     - `str`: The synonym for the word.
     """
     try:
+        nltk.download("wordnet", quiet=True)
         synonyms = []
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
@@ -82,37 +81,20 @@ def augment_text_with_synonyms(text, augmentation_factor, probability, progress=
             raise ValueError("Probability value cannot be of NoneType. Choose a float from 0 to 1")
 
         tokens = text.split()
-        num_tokens = len(tokens)
-        processed_tokens = 0
-
-        start_time = time.time()
-
-        for _ in range(augmentation_factor):
-            augmented_tokens = []
 
-            for token in tokens:
-                if random.random() < probability:
-                    replaced_token = replace_word_with_synonym(token)
-                    augmented_tokens.append(replaced_token)
-                else:
-                    augmented_tokens.append(token)
+        with tqdm.tqdm(total=augmentation_factor * len(tokens), desc="Augmenting Text", disable=not progress) as pbar:
+            for _ in range(augmentation_factor):
+                augmented_tokens = []
 
-                processed_tokens += 1
+                for token in tokens:
+                    if random.random() < probability:
+                        replaced_token = replace_word_with_synonym(token)
+                        augmented_tokens.append(replaced_token)
+                    else:
+                        augmented_tokens.append(token)
+                    pbar.update(1)
 
-                # Print progress
-                if progress:
-                    elapsed_time = time.time() - start_time
-                    if elapsed_time == 0:
-                        elapsed_time = 1e-6  # Set a small value to avoid division by zero
-                    tokens_per_sec = processed_tokens / elapsed_time
-                    print(f"Progress: {processed_tokens}/{num_tokens} tokens | {tokens_per_sec:.2f} tokens/sec", end="\r")
-
-            augmented_text.append(' '.join(augmented_tokens))
-
-        # Print completion message
-        if progress:
-            print(" " * 100, end="\r")  # Clear progress line
-            print("Augmentation complete.")
+                augmented_text.append(' '.join(augmented_tokens))
 
     except Exception as e:
         print(f"An error occurred during text augmentation: {str(e)}")
```
```diff
@@ -175,6 +157,7 @@ def insert_random_word(text, word):
     - `str`: The text with the randomly inserted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
         words.insert(random.randint(0, len(words)), word)
         modified_text = " ".join(words)
@@ -184,7 +167,7 @@ def insert_random_word(text, word):
         return text
 
 
-def delete_random_word(text):
+def random_word_deletion(text, num_deletions=1):
     """
     Delete a random word from the input text.
 
@@ -193,20 +176,62 @@ def delete_random_word(text):
 
     Parameters:
     - `text` (str): The input text for word deletion.
+    - `num_deletions` (int): The number of words to delete.
 
     Returns:
     - `str`: The text with a randomly deleted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
-        if len(words) > 1:
-            words.pop(random.randint(0, len(words) - 1))
+        for _ in range(num_deletions):
+            if len(words) > 1:
+                words.pop(random.randint(0, len(words) - 1))
         modified_text = " ".join(words)
         return modified_text
     except Exception as e:
         print(f"An error occurred during word deletion: {str(e)}")
         return text
 
+def delete_random_word(text):
+    """
+    Delete a random word from the input text.
+
+    This function randomly deletes a word from the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word deletion.
+
+    Returns:
+    - `str`: The text with a randomly deleted word.
+    """
+    return random_word_deletion(text, num_deletions=1)
+
+def swap_random_words(text):
+    """
+    Swaps two random words in the text.
+
+    This function randomly swaps two words in the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word swapping.
+
+    Returns:
+    - `str`: The text with two words swapped.
+    """
+    try:
+        nltk.download("punkt", quiet=True)
+        words = nltk.word_tokenize(text)
+        if len(words) > 1:
+            idx1, idx2 = random.sample(range(len(words)), 2)
+            words[idx1], words[idx2] = words[idx2], words[idx1]
+        modified_text = " ".join(words)
+        return modified_text
+    except Exception as e:
+        print(f"An error occurred during word swapping: {str(e)}")
+        return text
 
 def insert_synonym(text, word):
     """
@@ -245,6 +270,8 @@ def paraphrase(text):
     - `str`: The paraphrased text.
     """
     try:
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)
         tokens = nltk.word_tokenize(text)
         tagged_tokens = nltk.pos_tag(tokens)
         paraphrased_tokens = [replace_word_with_synonym(token) if tag.startswith(("VB", "NN", "JJ")) else token for token, tag in tagged_tokens]
```
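Putting the new replication helpers together (illustrative, assuming duplipy v0.2.5; results vary per run because deletions and swaps are random):

```python
from duplipy import delete_random_word, random_word_deletion, swap_random_words

text = "The quick brown fox jumps over the lazy dog"
print(random_word_deletion(text, num_deletions=3))  # e.g. "The brown fox over lazy dog"
print(swap_random_words(text))                      # e.g. "The lazy brown fox jumps over the quick dog"
print(delete_random_word(text))                     # backward-compatible wrapper: num_deletions=1
```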

duplipy/similarity.py

Lines changed: 99 additions & 2 deletions
```diff
@@ -5,8 +5,15 @@
 - `edit_distance_score(text1, text2)`: Calculate the edit distance score between two texts.
 - `bleu_score(reference, candidate)`: Calculate the BLEU score between a reference sentence and a candidate sentence.
 - `jaccard_similarity_score(text1, text2)`: Calculate Jaccard similarity between two texts.
+- `sorensen_dice_coefficient(text1, text2)`: Calculate the Sorensen-Dice coefficient between two texts.
+- `cosine_similarity_score(text1, text2)`: Calculate the cosine similarity between two texts.
+- `mean_squared_error(image1, image2)`: Calculate the mean squared error (MSE) between two images.
+- `psnr(image1, image2)`: Calculate the peak signal-to-noise ratio (PSNR) between two images.
 """
-
+import collections
+import math
+import numpy as np
+from PIL import Image
 import nltk
 from nltk.metrics import distance
 from nltk.translate.bleu_score import sentence_bleu
@@ -50,6 +57,7 @@ def bleu_score(reference, candidate):
     - `float`: The BLEU score. The score ranges from 0 (no similarity) to 1 (perfect match).
     """
     try:
+        nltk.download('punkt', quiet=True)
         # Tokenize the reference and candidate sentences
         reference_tokens = nltk.word_tokenize(reference)
         candidate_tokens = nltk.word_tokenize(candidate)
@@ -84,4 +92,93 @@ def jaccard_similarity_score(text1, text2):
     intersection = len(set1.intersection(set2))
     union = len(set1.union(set2))
     similarity_score = intersection / union if union != 0 else 0
-    return similarity_score
+    return similarity_score
+
+def sorensen_dice_coefficient(text1, text2):
+    """
+    Calculate the Sorensen-Dice coefficient between two texts.
+
+    The Sorensen-Dice coefficient is a statistic used for comparing the
+    similarity of two samples.
+
+    Parameters:
+    - `text1` (str): The first text for comparison.
+    - `text2` (str): The second text for comparison.
+
+    Returns:
+    - `float`: The Sorensen-Dice coefficient between the two texts.
+    """
+    set1 = set(text1.split())
+    set2 = set(text2.split())
+    intersection = len(set1.intersection(set2))
+    return 2 * intersection / (len(set1) + len(set2))
+
+def cosine_similarity_score(text1, text2):
+    """
+    Calculate the cosine similarity between two texts.
+
+    Cosine similarity is a measure of similarity between two non-zero vectors
+    of an inner product space that measures the cosine of the angle between them.
+
+    Parameters:
+    - `text1` (str): The first text for comparison.
+    - `text2` (str): The second text for comparison.
+
+    Returns:
+    - `float`: The cosine similarity score between the two texts.
+    """
+    nltk.download('punkt', quiet=True)
+    vec1 = collections.Counter(nltk.word_tokenize(text1))
+    vec2 = collections.Counter(nltk.word_tokenize(text2))
+
+    intersection = set(vec1.keys()) & set(vec2.keys())
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
+    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
+    denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+    if not denominator:
+        return 0.0
+    return float(numerator) / denominator
+
+def mean_squared_error(image1, image2):
+    """
+    Calculate the mean squared error (MSE) between two images.
+
+    MSE is a measure of the average squared difference between the estimated
+    values and the actual value.
+
+    Parameters:
+    - `image1` (PIL.Image.Image): The first image for comparison.
+    - `image2` (PIL.Image.Image): The second image for comparison.
+
+    Returns:
+    - `float`: The mean squared error between the two images.
+    """
+    image1 = np.array(image1)
+    image2 = np.array(image2)
+    err = np.sum((image1.astype("float") - image2.astype("float")) ** 2)
+    err /= float(image1.shape[0] * image1.shape[1])
+    return err
+
+def psnr(image1, image2):
+    """
+    Calculate the peak signal-to-noise ratio (PSNR) between two images.
+
+    PSNR is the ratio between the maximum possible power of a signal and the
+    power of corrupting noise that affects the fidelity of its representation.
+
+    Parameters:
+    - `image1` (PIL.Image.Image): The first image for comparison.
+    - `image2` (PIL.Image.Image): The second image for comparison.
+
+    Returns:
+    - `float`: The peak signal-to-noise ratio between the two images.
+    """
+    mse = mean_squared_error(image1, image2)
+    if mse == 0:
+        return 100
+    max_pixel = 255.0
+    psnr = 20 * math.log10(max_pixel / math.sqrt(mse))
+    return psnr
```
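A worked sketch of the four new metrics (illustrative, assuming duplipy v0.2.5 plus numpy and Pillow; the sample inputs are made up, and the expected values follow directly from the definitions above):

```python
import numpy as np
from PIL import Image
from duplipy import (
    sorensen_dice_coefficient,
    cosine_similarity_score,
    mean_squared_error,
    psnr,
)

a = "the quick brown fox"
b = "the quick red fox"
print(sorensen_dice_coefficient(a, b))  # 2 * 3 shared words / (4 + 4) = 0.75
print(cosine_similarity_score(a, b))    # 3 / (sqrt(4) * sqrt(4)) = 0.75

# Two flat grayscale images differing by 10 at every pixel.
img1 = Image.fromarray(np.zeros((8, 8), dtype=np.uint8))
img2 = Image.fromarray(np.full((8, 8), 10, dtype=np.uint8))
print(mean_squared_error(img1, img2))   # (10**2 * 64) / 64 = 100.0
print(psnr(img1, img2))                 # 20 * log10(255 / 10) ≈ 28.13 dB
```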

0 commit comments