From c82fdd19dfe43561b07b9082d3a4a0d13fbb3bc5 Mon Sep 17 00:00:00 2001 From: tanmay-punekar <52330061+tanmay-punekar@users.noreply.github.com> Date: Wed, 1 Apr 2020 18:24:28 +0530 Subject: [PATCH 1/4] Add files via upload --- newspaper/resources/text/stopwords-mr.txt | 99 +++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 newspaper/resources/text/stopwords-mr.txt diff --git a/newspaper/resources/text/stopwords-mr.txt b/newspaper/resources/text/stopwords-mr.txt new file mode 100644 index 000000000..27949fafb --- /dev/null +++ b/newspaper/resources/text/stopwords-mr.txt @@ -0,0 +1,99 @@ +अधिक +अनेक +अशी +असलयाचे +असलेल्या +असा +असून +असे +आज +आणि +आता +आपल्या +आला +आली +आले +आहे +आहेत +एक +एका +कमी +करणयात +करून +का +काम +काय +काही +किवा +की +केला +केली +केले +कोटी +गेल्या +घेऊन +जात +झाला +झाली +झाले +झालेल्या +टा +डॉ +तर +तरी +तसेच +ता +ती +तीन +ते +तो +त्या +त्याचा +त्याची +त्याच्या +त्याना +त्यानी +त्यामुळे +त्री +दिली +दोन +न +नाही +निर्ण्य +पण +पम +परयतन +पाटील +म +मात्र +माहिती +मी +मुबी +म्हणजे +म्हणाले +म्हणून +या +याचा +याची +याच्या +याना +यानी +येणार +येत +येथील +येथे +लाख +व +व्यकत +सर्व +सागित्ले +सुरू +हजार +हा +ही +हे +होणार +होत +होता +होती +होते \ No newline at end of file From 08592dd2ce729e9867667b3ee52e5d5f841fc089 Mon Sep 17 00:00:00 2001 From: tanmay-punekar <52330061+tanmay-punekar@users.noreply.github.com> Date: Wed, 1 Apr 2020 18:29:33 +0530 Subject: [PATCH 2/4] Update article.py --- newspaper/article.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/newspaper/article.py b/newspaper/article.py index df0d9c435..789e8515d 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -12,6 +12,7 @@ import requests +from googletrans import Translator from . import images from . import network from . 
import nlp @@ -368,6 +369,12 @@ def is_media_news(self): if s in self.url: return True return False + + def translater(self,dest = 'en'): + self.dest = dest + translator = Translator() + result = translator.translate(self.text,self.dest) + return result.text def nlp(self): """Keyword extraction wrapper From a6087251e6a50badd06a95474f2dda1a8f629290 Mon Sep 17 00:00:00 2001 From: tanmay-punekar <52330061+tanmay-punekar@users.noreply.github.com> Date: Wed, 1 Apr 2020 18:31:30 +0530 Subject: [PATCH 3/4] Update utils.py --- newspaper/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/newspaper/utils.py b/newspaper/utils.py index bfa441482..f0ede42cb 100644 --- a/newspaper/utils.py +++ b/newspaper/utils.py @@ -393,6 +393,7 @@ def print_available_languages(): 'uk': 'Ukrainian', 'vi': 'Vietnamese', 'zh': 'Chinese', + 'mr': 'Marathi' } codes = get_available_languages() From b5262a7d6d8b09e89d295e2ed95be949a6b6b30b Mon Sep 17 00:00:00 2001 From: tanmay-punekar <52330061+tanmay-punekar@users.noreply.github.com> Date: Wed, 1 Apr 2020 18:38:03 +0530 Subject: [PATCH 4/4] Update text.py --- newspaper/text.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/newspaper/text.py b/newspaper/text.py index 23b4c6b1e..1046f46dd 100644 --- a/newspaper/text.py +++ b/newspaper/text.py @@ -182,6 +182,30 @@ def get_stopword_count(self, content): ws.set_stopword_count(len(overlapping_stopwords)) ws.set_stop_words(overlapping_stopwords) return ws + +class StopWordsMarathi(StopWords): + """Marathi segmentation + """ + def __init__(self, language='mr'): + super(StopWordsMarathi, self).__init__(language='mr') + + def get_stopword_count(self, content): + if not content: + return WordStats() + ws = WordStats() + stripped_input = self.remove_punctuation(content) + candidate_words = self.candidate_words(stripped_input) + overlapping_stopwords = [] + c = 0 + for w in candidate_words: + c += 1 + for stop_word in self.STOP_WORDS: + overlapping_stopwords.append(stop_word) 
+ + ws.set_word_count(c) + ws.set_stopword_count(len(overlapping_stopwords)) + ws.set_stop_words(overlapping_stopwords) + return ws class StopWordsJapanese(StopWords):