
Commit 6ab152f

Merge pull request #1 from Infinitode/duplipy-improvements
Duplipy improvements, `v0.2.5`.
2 parents c544964 + 04fec13 · commit 6ab152f

File tree

9 files changed: +590 −270 lines


duplipy/__init__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,5 +1,5 @@
 import duplipy
 from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag, remove_profanity_from_text, remove_sensitive_info_from_text, remove_hate_speech_from_text, post_format_text
-from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay
-from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score
-from .text_analysis import analyze_sentiment
+from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, random_word_deletion, swap_random_words, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay
+from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score, sorensen_dice_coefficient, cosine_similarity_score, mean_squared_error, psnr
+from .text_analysis import analyze_sentiment, named_entity_recognition
```
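The rewritten import lines surface the new v0.2.5 API at the package root. A quick smoke test of a few newly exported names (illustrative, assuming duplipy v0.2.5 is installed; outputs vary because the augmentations are random):

```python
# Illustrative check of the new top-level exports in v0.2.5.
from duplipy import (
    random_word_deletion,       # new in replication
    swap_random_words,          # new in replication
    sorensen_dice_coefficient,  # new in similarity
)

text = "The quick brown fox jumps over the lazy dog"
print(random_word_deletion(text, num_deletions=2))  # two random words removed
print(swap_random_words(text))                      # two random words swapped
print(sorensen_dice_coefficient(text, "The quick brown fox"))
```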

duplipy/formatting.py

Lines changed: 8 additions & 5 deletions
```diff
@@ -27,11 +27,6 @@
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 
-nltk.download('stopwords', quiet=True)
-nltk.download('punkt', quiet=True)
-nltk.download('wordnet', quiet=True)
-nltk.download('averaged_perceptron_tagger', quiet=True)
-
 def remove_stopwords(text):
     """
     Remove stopwords from the input text using NLTK's stopwords.
@@ -46,6 +41,7 @@ def remove_stopwords(text):
     - `str`: The text without stopwords.
     """
     try:
+        nltk.download('stopwords', quiet=True)
         stop_words = set(stopwords.words('english'))
         tokens = text.split()
         filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
@@ -192,6 +188,7 @@ def tokenize_text(text):
     Returns:
     - `list`: A list of tokens (words) from the input text.
     """
+    nltk.download('punkt', quiet=True)
     tokens = word_tokenize(text)
     return tokens
 
@@ -225,6 +222,7 @@ def lemmatize_words(words):
     Returns:
     - `list`: A list of lemmatized words.
     """
+    nltk.download('wordnet', quiet=True)
     lemmatizer = WordNetLemmatizer()
     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
     return lemmatized_words
@@ -243,6 +241,8 @@ def pos_tag(text):
     - `list`: A list of tuples containing (word, tag) pairs.
     """
     try:
+        nltk.download('punkt', quiet=True)
+        nltk.download('averaged_perceptron_tagger', quiet=True)
         tokens = nltk.word_tokenize(text)
         tagged_words = nltk.pos_tag(tokens)
         return tagged_words
@@ -262,6 +262,7 @@ def remove_profanity_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = remove_profanity(sentences, language='All')
     cleaned_text = ' '.join(cleaned_sentences)
@@ -280,6 +281,7 @@ def remove_sensitive_info_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = remove_sensitive_information(sentences)
     cleaned_text = ' '.join(cleaned_sentences)
@@ -298,6 +300,7 @@ def remove_hate_speech_from_text(text):
     Returns:
     - `text` (str): The cleaned output text.
     """
+    nltk.download('punkt', quiet=True)
     sentences = nltk.sent_tokenize(text)
     cleaned_sentences = []
     for sentence in sentences:
```
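Every hunk in this file applies the same fix: the four module-level `nltk.download(...)` calls are deleted and re-inserted inside the functions that actually need each resource, so `import duplipy` no longer touches the network and downloads happen lazily on first use (`nltk.download` returns immediately when the data is already cached). A minimal sketch of the pattern (illustrative, not duplipy's exact code):

```python
# Sketch of the lazy-download pattern adopted in formatting.py (illustrative).
import nltk

def tokenize_text(text):
    # Fetch the 'punkt' tokenizer model only when tokenization is requested;
    # this is a no-op if the resource is already present locally.
    nltk.download('punkt', quiet=True)
    return nltk.word_tokenize(text)

print(tokenize_text("DupliPy now downloads NLTK data on first use."))
```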

duplipy/replication.py

Lines changed: 62 additions & 35 deletions
```diff
@@ -8,6 +8,8 @@
 - `augment_file_with_synonyms(file_path, augmentation_factor, probability, progress=True)`: Augment a text file by replacing words with synonyms.
 - `insert_random_word(text, word)`: Insert a random word into the input text.
 - `delete_random_word(text)`: Delete a random word from the input text.
+- `random_word_deletion(text, num_deletions=1)`: Deletes a user-specified number of random words from the text.
+- `swap_random_words(text)`: Swaps two random words in the text.
 - `insert_synonym(text, word)`: Insert a synonym of the given word into the input text.
 - `paraphrase(text)`: Paraphrase the input text.
 - `flip_horizontal(image)`: Flip the input image horizontally.
@@ -31,10 +33,6 @@
 import tqdm
 from PIL import ImageEnhance
 
-nltk.download("wordnet", quiet=True)
-nltk.download("averaged_perceptron_tagger", quiet=True)
-nltk.download("punkt", quiet=True)
-
 def replace_word_with_synonym(word):
     """
     Replace the given word with a synonym.
@@ -49,6 +47,7 @@ def replace_word_with_synonym(word):
     - `str`: The synonym for the word.
     """
     try:
+        nltk.download("wordnet", quiet=True)
         synonyms = []
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
@@ -82,37 +81,20 @@ def augment_text_with_synonyms(text, augmentation_factor, probability, progress=
             raise ValueError("Probability value cannot be of NoneType. Choose a float from 0 to 1")
 
         tokens = text.split()
-        num_tokens = len(tokens)
-        processed_tokens = 0
-
-        start_time = time.time()
-
-        for _ in range(augmentation_factor):
-            augmented_tokens = []
 
-            for token in tokens:
-                if random.random() < probability:
-                    replaced_token = replace_word_with_synonym(token)
-                    augmented_tokens.append(replaced_token)
-                else:
-                    augmented_tokens.append(token)
+        with tqdm.tqdm(total=augmentation_factor * len(tokens), desc="Augmenting Text", disable=not progress) as pbar:
+            for _ in range(augmentation_factor):
+                augmented_tokens = []
 
-                processed_tokens += 1
+                for token in tokens:
+                    if random.random() < probability:
+                        replaced_token = replace_word_with_synonym(token)
+                        augmented_tokens.append(replaced_token)
+                    else:
+                        augmented_tokens.append(token)
+                    pbar.update(1)
 
-                # Print progress
-                if progress:
-                    elapsed_time = time.time() - start_time
-                    if elapsed_time == 0:
-                        elapsed_time = 1e-6  # Set a small value to avoid division by zero
-                    tokens_per_sec = processed_tokens / elapsed_time
-                    print(f"Progress: {processed_tokens}/{num_tokens} tokens | {tokens_per_sec:.2f} tokens/sec", end="\r")
-
-            augmented_text.append(' '.join(augmented_tokens))
-
-        # Print completion message
-        if progress:
-            print(" " * 100, end="\r")  # Clear progress line
-            print("Augmentation complete.")
+                augmented_text.append(' '.join(augmented_tokens))
 
     except Exception as e:
         print(f"An error occurred during text augmentation: {str(e)}")
```
```diff
@@ -175,6 +157,7 @@ def insert_random_word(text, word):
     - `str`: The text with the randomly inserted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
         words.insert(random.randint(0, len(words)), word)
         modified_text = " ".join(words)
@@ -184,7 +167,7 @@ def insert_random_word(text, word):
         return text
 
 
-def delete_random_word(text):
+def random_word_deletion(text, num_deletions=1):
     """
     Delete a random word from the input text.
 
@@ -193,20 +176,62 @@ def delete_random_word(text):
 
     Parameters:
     - `text` (str): The input text for word deletion.
+    - `num_deletions` (int): The number of words to delete.
 
     Returns:
     - `str`: The text with a randomly deleted word.
     """
     try:
+        nltk.download("punkt", quiet=True)
         words = nltk.word_tokenize(text)
-        if len(words) > 1:
-            words.pop(random.randint(0, len(words) - 1))
+        for _ in range(num_deletions):
+            if len(words) > 1:
+                words.pop(random.randint(0, len(words) - 1))
         modified_text = " ".join(words)
         return modified_text
     except Exception as e:
         print(f"An error occurred during word deletion: {str(e)}")
         return text
 
+def delete_random_word(text):
+    """
+    Delete a random word from the input text.
+
+    This function randomly deletes a word from the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word deletion.
+
+    Returns:
+    - `str`: The text with a randomly deleted word.
+    """
+    return random_word_deletion(text, num_deletions=1)
+
+def swap_random_words(text):
+    """
+    Swaps two random words in the text.
+
+    This function randomly swaps two words in the input text, creating variations
+    for text augmentation or diversity.
+
+    Parameters:
+    - `text` (str): The input text for word swapping.
+
+    Returns:
+    - `str`: The text with two words swapped.
+    """
+    try:
+        nltk.download("punkt", quiet=True)
+        words = nltk.word_tokenize(text)
+        if len(words) > 1:
+            idx1, idx2 = random.sample(range(len(words)), 2)
+            words[idx1], words[idx2] = words[idx2], words[idx1]
+        modified_text = " ".join(words)
+        return modified_text
+    except Exception as e:
+        print(f"An error occurred during word swapping: {str(e)}")
+        return text
 
 def insert_synonym(text, word):
     """
@@ -245,6 +270,8 @@ def paraphrase(text):
     - `str`: The paraphrased text.
     """
     try:
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)
         tokens = nltk.word_tokenize(text)
         tagged_tokens = nltk.pos_tag(tokens)
         paraphrased_tokens = [replace_word_with_synonym(token) if tag.startswith(("VB", "NN", "JJ")) else token for token, tag in tagged_tokens]
```
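Putting the new replication helpers together (illustrative, assuming duplipy v0.2.5; results vary per run because deletions and swaps are random):

```python
from duplipy import delete_random_word, random_word_deletion, swap_random_words

text = "The quick brown fox jumps over the lazy dog"
print(random_word_deletion(text, num_deletions=3))  # e.g. "The brown fox over lazy dog"
print(swap_random_words(text))                      # e.g. "The lazy brown fox jumps over the quick dog"
print(delete_random_word(text))                     # backward-compatible wrapper: num_deletions=1
```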

duplipy/similarity.py

Lines changed: 99 additions & 2 deletions
```diff
@@ -5,8 +5,15 @@
 - `edit_distance_score(text1, text2)`: Calculate the edit distance score between two texts.
 - `bleu_score(reference, candidate)`: Calculate the BLEU score between a reference sentence and a candidate sentence.
 - `jaccard_similarity_score(text1, text2)`: Calculate Jaccard similarity between two texts.
+- `sorensen_dice_coefficient(text1, text2)`: Calculate the Sorensen-Dice coefficient between two texts.
+- `cosine_similarity_score(text1, text2)`: Calculate the cosine similarity between two texts.
+- `mean_squared_error(image1, image2)`: Calculate the mean squared error (MSE) between two images.
+- `psnr(image1, image2)`: Calculate the peak signal-to-noise ratio (PSNR) between two images.
 """
-
+import collections
+import math
+import numpy as np
+from PIL import Image
 import nltk
 from nltk.metrics import distance
 from nltk.translate.bleu_score import sentence_bleu
@@ -50,6 +57,7 @@ def bleu_score(reference, candidate):
     - `float`: The BLEU score. The score ranges from 0 (no similarity) to 1 (perfect match).
     """
     try:
+        nltk.download('punkt', quiet=True)
         # Tokenize the reference and candidate sentences
         reference_tokens = nltk.word_tokenize(reference)
         candidate_tokens = nltk.word_tokenize(candidate)
@@ -84,4 +92,93 @@ def jaccard_similarity_score(text1, text2):
     intersection = len(set1.intersection(set2))
     union = len(set1.union(set2))
     similarity_score = intersection / union if union != 0 else 0
-    return similarity_score
+    return similarity_score
+
+def sorensen_dice_coefficient(text1, text2):
+    """
+    Calculate the Sorensen-Dice coefficient between two texts.
+
+    The Sorensen-Dice coefficient is a statistic used for comparing the
+    similarity of two samples.
+
+    Parameters:
+    - `text1` (str): The first text for comparison.
+    - `text2` (str): The second text for comparison.
+
+    Returns:
+    - `float`: The Sorensen-Dice coefficient between the two texts.
+    """
+    set1 = set(text1.split())
+    set2 = set(text2.split())
+    intersection = len(set1.intersection(set2))
+    return 2 * intersection / (len(set1) + len(set2))
+
+def cosine_similarity_score(text1, text2):
+    """
+    Calculate the cosine similarity between two texts.
+
+    Cosine similarity is a measure of similarity between two non-zero vectors
+    of an inner product space that measures the cosine of the angle between them.
+
+    Parameters:
+    - `text1` (str): The first text for comparison.
+    - `text2` (str): The second text for comparison.
+
+    Returns:
+    - `float`: The cosine similarity score between the two texts.
+    """
+    nltk.download('punkt', quiet=True)
+    vec1 = collections.Counter(nltk.word_tokenize(text1))
+    vec2 = collections.Counter(nltk.word_tokenize(text2))
+
+    intersection = set(vec1.keys()) & set(vec2.keys())
+    numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
+    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
+    denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+    if not denominator:
+        return 0.0
+    return float(numerator) / denominator
+
+def mean_squared_error(image1, image2):
+    """
+    Calculate the mean squared error (MSE) between two images.
+
+    MSE is a measure of the average squared difference between the estimated
+    values and the actual value.
+
+    Parameters:
+    - `image1` (PIL.Image.Image): The first image for comparison.
+    - `image2` (PIL.Image.Image): The second image for comparison.
+
+    Returns:
+    - `float`: The mean squared error between the two images.
+    """
+    image1 = np.array(image1)
+    image2 = np.array(image2)
+    err = np.sum((image1.astype("float") - image2.astype("float")) ** 2)
+    err /= float(image1.shape[0] * image1.shape[1])
+    return err
+
+def psnr(image1, image2):
+    """
+    Calculate the peak signal-to-noise ratio (PSNR) between two images.
+
+    PSNR is the ratio between the maximum possible power of a signal and the
+    power of corrupting noise that affects the fidelity of its representation.
+
+    Parameters:
+    - `image1` (PIL.Image.Image): The first image for comparison.
+    - `image2` (PIL.Image.Image): The second image for comparison.
+
+    Returns:
+    - `float`: The peak signal-to-noise ratio between the two images.
+    """
+    mse = mean_squared_error(image1, image2)
+    if mse == 0:
+        return 100
+    max_pixel = 255.0
+    psnr = 20 * math.log10(max_pixel / math.sqrt(mse))
+    return psnr
```
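A worked sketch of the four new metrics (illustrative, assuming duplipy v0.2.5 plus numpy and Pillow; the sample inputs are made up, and the expected values follow directly from the definitions above):

```python
import numpy as np
from PIL import Image
from duplipy import (
    sorensen_dice_coefficient,
    cosine_similarity_score,
    mean_squared_error,
    psnr,
)

a = "the quick brown fox"
b = "the quick red fox"
print(sorensen_dice_coefficient(a, b))  # 2 * 3 shared words / (4 + 4) = 0.75
print(cosine_similarity_score(a, b))    # 3 / (sqrt(4) * sqrt(4)) = 0.75

# Two flat grayscale images differing by 10 at every pixel.
img1 = Image.fromarray(np.zeros((8, 8), dtype=np.uint8))
img2 = Image.fromarray(np.full((8, 8), 10, dtype=np.uint8))
print(mean_squared_error(img1, img2))   # (10**2 * 64) / 64 = 100.0
print(psnr(img1, img2))                 # 20 * log10(255 / 10) ≈ 28.13 dB
```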

0 commit comments