Skip to content

Commit a21fe85

Browse files
author
SOCIALSCIENCEai
committed
add spellcheck class
1 parent 501709c commit a21fe85

File tree

1 file changed

+80
-1
lines changed

1 file changed

+80
-1
lines changed

sentibank/utils.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from spacymoji import Emoji
99
from sentibank.dict_arXiv import emos
1010
import re
11+
import enchant
1112
from sentibank import archive
1213

1314
load = archive.load()
@@ -339,7 +340,82 @@ def summarise_lex_dict(self, lexicon_dictionary: dict):
339340

340341
return pprint(summary)
341342

343+
class spellcheck:
    """
    A class for spell-checking sentences.

    Only words containing an elongation (the same letter repeated three or
    more times, e.g. "sooooo") are touched; every other word is passed
    through unchanged apart from the final lowercasing.

    Attributes:
        spellchecker (enchant.Dict): An enchant dictionary for English.

    Methods:
        shorten_word(self, word: str) -> str:
            Shortens runs of three or more consecutive identical letters
            to two consecutive identical letters.

        sentence(self, sentence: str) -> str:
            Spell-checks the given sentence, correcting misspelled words.
            Shortens elongated words before spell-checking.
            Returns the spell-checked sentence in lowercase.

    Example:
        spellchecker = spellcheck()
        corrected_sentence = spellchecker.sentence("I am sooooo happppyyyy")
        print(corrected_sentence)
    """

    # One letter repeated three or more times. ``[^\W\d_]`` is "a word
    # character that is neither a digit nor an underscore", so numbers such
    # as "1000" are NOT treated as elongated words.
    _ELONGATED = re.compile(r'([^\W\d_])\1{2,}')

    # Shortened words that enchant corrects wrongly, mapped to the intended
    # word. Keys are lowercase; add more entries here as they are found.
    _OVERRIDES = {"soo": "so"}

    def __init__(self):
        self.spellchecker = enchant.Dict("en_US")

    def shorten_word(self, word: str) -> str:
        """
        Shortens runs of three or more consecutive identical letters to two
        consecutive identical letters.

        Digits and underscores are left alone, so "1000" stays "1000".

        Args:
            word (str): The word to be shortened.

        Returns:
            str: The shortened word.
        """
        # A single substitution handles every run in the word at once.
        return self._ELONGATED.sub(r'\1\1', word)

    def sentence(self, sentence: str) -> str:
        """
        Spell-checks the given sentence, correcting misspelled words.
        Shortens words with three or more consecutive identical letters
        before spell-checking. Words without such a run are not checked.
        Returns the spell-checked sentence in lowercase.

        Args:
            sentence (str): The sentence to be spell-checked.

        Returns:
            str: The spell-checked sentence, lowercased, with words joined
            by single spaces.
        """
        spellchecked_words = []

        for word in sentence.split():
            # Words without an elongation are kept exactly as they are.
            if not self._ELONGATED.search(word):
                spellchecked_words.append(word)
                continue

            word = self.shorten_word(word)

            # Known cases where enchant's first suggestion is wrong. The
            # lookup is case-insensitive because the result is lowercased.
            override = self._OVERRIDES.get(word.lower())
            if override is not None:
                spellchecked_words.append(override)
                continue

            # If shortening already produced a valid word (e.g. "goood" ->
            # "good"), keep it rather than replacing it with a suggestion.
            if self.spellchecker.check(word):
                spellchecked_words.append(word)
                continue

            suggestions = self.spellchecker.suggest(word)
            spellchecked_words.append(suggestions[0] if suggestions else word)

        return ' '.join(spellchecked_words).lower()
343419
class analyze:
344420
"""
345421
A class for sentiment analysis with lexicon dictionaries.
@@ -358,7 +434,7 @@ class analyze:
358434
Returns a summary of sentiment scores or labels depending on the dictionary used.
359435
"""
360436
def __init__(self):
361-
pass
437+
self.spellcheck = spellcheck()
362438

363439
def dictionary(self, dictionary: dict or str = None):
364440
"""
@@ -386,6 +462,7 @@ def dictionary(self, dictionary: dict or str = None):
386462
def sentiment(self, text: str, dictionary: str = None):
387463
"""
388464
Performs bag-of-words sentiment analysis on the given text using the specified lexicon dictionary.
465+
It checks spelling of words if a word has three consecutive identical alphabets (e.g. "happppyyyy")
389466
390467
Parameters:
391468
text (str): The input text for sentiment analysis.
@@ -397,6 +474,8 @@ def sentiment(self, text: str, dictionary: str = None):
397474
If the lexicon dictionary is score-based, returns the total sentiment score as a float.
398475
If the lexicon dictionary is label-based, returns a dictionary with sentiment class counts.
399476
"""
477+
text = self.spellcheck.sentence(text)
478+
400479
avaliable_dictionary = [
401480
"MASTER_v2022",
402481
"VADER_v2014",

0 commit comments

Comments
 (0)