88from spacymoji import Emoji
99from sentibank .dict_arXiv import emos
1010import re
11+ import enchant
1112from sentibank import archive
1213
1314load = archive .load ()
@@ -339,7 +340,82 @@ def summarise_lex_dict(self, lexicon_dictionary: dict):
339340
340341 return pprint (summary )
341342
class spellcheck:
    """
    Normalise elongated words in a sentence and spell-check them.

    Only words containing a run of three or more identical word characters
    (e.g. "sooooo", "happppyyyy") are processed; every other word is passed
    through unchanged apart from the final lowercasing.

    Attributes:
        spellchecker (enchant.Dict): An enchant dictionary for "en_US".

    Methods:
        shorten_word(self, word: str) -> str:
            Collapses every run of three or more identical characters to two.

        sentence(self, sentence: str) -> str:
            Shortens elongated words, spell-checks them, and returns the
            resulting sentence in lowercase.

    Example:
        spellchecker = spellcheck()
        corrected_sentence = spellchecker.sentence("I am sooooo happppyyyy")
        print(corrected_sentence)
    """

    # A word character followed by two or more repeats of itself.
    _ELONGATION = re.compile(r'(\w)\1{2,}')

    # Shortened forms for which enchant's top suggestion is known to be wrong,
    # mapped to the intended word. Keys are lowercase. Add more entries here.
    _OVERRIDES = {"soo": "so"}

    def __init__(self):
        self.spellchecker = enchant.Dict("en_US")

    def shorten_word(self, word: str) -> str:
        """
        Collapse each run of three or more identical characters to two.

        Args:
            word (str): The word to be shortened.

        Returns:
            str: The shortened word, e.g. "happppyyyy" -> "happyy".
        """
        # One pass handles every run; a collapsed run cannot merge with its
        # neighbours because the match is greedy and therefore maximal.
        return self._ELONGATION.sub(r'\1\1', word)

    def _correct(self, word: str) -> str:
        """Return the best correction for an already-shortened word."""
        override = self._OVERRIDES.get(word.lower())
        if override is not None:
            return override

        # Keep words that are already valid (e.g. "coool" -> "cool") instead
        # of blindly replacing them with the first suggestion.
        if self.spellchecker.check(word):
            return word

        suggestions = self.spellchecker.suggest(word)
        return suggestions[0] if suggestions else word

    def sentence(self, sentence: str) -> str:
        """
        Spell-check the elongated words of a sentence.

        Words are obtained by splitting on whitespace. A word containing a run
        of three or more identical characters is shortened and then corrected;
        all other words are kept as they are. The words are re-joined with
        single spaces and the result is lowercased.

        Args:
            sentence (str): The sentence to be spell-checked.

        Returns:
            str: The spell-checked sentence in lowercase.
        """
        corrected_words = []

        for word in sentence.split():
            if self._ELONGATION.search(word):
                word = self._correct(self.shorten_word(word))
            corrected_words.append(word)

        return ' '.join(corrected_words).lower()
343419class analyze :
344420 """
345421 A class for sentiment analysis with lexicon dictionaries.
@@ -358,7 +434,7 @@ class analyze:
358434 Returns a summary of sentiment scores or labels depending on the dictionary used.
359435 """
360436 def __init__ (self ):
361- pass
437+ self . spellcheck = spellcheck ()
362438
363439 def dictionary (self , dictionary : dict or str = None ):
364440 """
@@ -386,6 +462,7 @@ def dictionary(self, dictionary: dict or str = None):
386462 def sentiment (self , text : str , dictionary : str = None ):
387463 """
388464 Performs bag-of-words sentiment analysis on the given text using the specified lexicon dictionary.
465+ It checks spelling of words if a word has three consecutive identical alphabets (e.g. "happppyyyy")
389466
390467 Parameters:
391468 text (str): The input text for sentiment analysis.
@@ -397,6 +474,8 @@ def sentiment(self, text: str, dictionary: str = None):
397474 If the lexicon dictionary is score-based, returns the total sentiment score as a float.
398475 If the lexicon dictionary is label-based, returns a dictionary with sentiment class counts.
399476 """
477+ text = self .spellcheck .sentence (text )
478+
400479 avaliable_dictionary = [
401480 "MASTER_v2022" ,
402481 "VADER_v2014" ,
0 commit comments