1- import logging
2-
3- logging .basicConfig (format = '%(asctime)s - %(message)s' ,
4- datefmt = '%d-%b-%y %H:%M:%S' )
5- logging .getLogger ().setLevel (logging .INFO )
6-
1+ import math
2+ import re
73import pandas as pd
84import numpy as np
95import tqdm
106import nltk
117from nltk import sent_tokenize , regexp_tokenize
12- import math
13- import re
8+
9+ import logging
10+
11+ logging .basicConfig (format = '%(asctime)s - %(message)s' ,
12+ datefmt = '%d-%b-%y %H:%M:%S' )
13+ logging .getLogger ().setLevel (logging .INFO )
1414
1515def sylco (word ):
1616 word = word .lower ()
@@ -26,12 +26,12 @@ def sylco(word):
2626 # 2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
2727 # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)
2828 # 4) check if consecutive vowels exists, triplets or pairs, count them as one.
29- doubleAndtripple = len (re .findall (r'[eaoui][eaoui]' , word ))
29+ double_and_triple = len (re .findall (r'[eaoui][eaoui]' , word ))
3030 tripple = len (re .findall (r'[eaoui][eaoui][eaoui]' , word ))
31- disc += doubleAndtripple + tripple
31+ disc += double_and_triple + tripple
3232
3333 # 5) count remaining vowels in word.
34- numVowels = len (re .findall (r'[eaoui]' , word ))
34+ num_vowels = len (re .findall (r'[eaoui]' , word ))
3535
3636 # 9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.
3737 if word [:3 ] == "tri" and len (word ) > 3 and word [3 ] in "aeoui" :
@@ -45,113 +45,109 @@ def sylco(word):
4545 # (These rules would be added if needed.)
4646
4747 # calculate the output
48- return numVowels - disc + syls
48+ return num_vowels - disc + syls
4949
5050
def gfi(text):
    """Gunning Fog Index: 0.4 * (avg sentence length + % long words).

    Words longer than 7 characters stand in for "complex" words here
    (the official formula counts 3+-syllable words). Returns 0 when the
    text contains no words or no sentences.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Guard against division by zero on empty/degenerate input.
    if not words or not sentences:
        return 0

    # Percentage of "long" words (> 7 characters).
    pct_long = len([w for w in words if len(w) > 7]) / len(words) * 100
    return 0.4 * ((len(words) / len(sentences)) + pct_long)
6665
6766
def fre(text):
    """Flesch Reading Ease.

    206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words),
    with syllable counts estimated by sylco(). Returns 0 when the text
    contains no words or no sentences.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Guard against division by zero on empty/degenerate input.
    if not sentences or not words:
        return 0

    # Total estimated syllables across all words.
    total_syllables = sum(sylco(w) for w in words)

    return (206.835
            - 1.015 * (len(words) / len(sentences))
            - 84.6 * (total_syllables / len(words)))
8583
8684
def fkgl(text):
    """Flesch–Kincaid Grade Level.

    0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59,
    with syllable counts estimated by sylco(). Returns 0 when the text
    contains no words or no sentences.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Guard against division by zero on empty/degenerate input.
    if not sentences or not words:
        return 0

    # Total estimated syllables across all words.
    total_syllables = sum(sylco(w) for w in words)

    return (0.39 * (len(words) / len(sentences))
            + 11.8 * (total_syllables / len(words))
            - 15.59)
104101
105102
def dcrf(text):
    """Dale–Chall Readability Formula (approximation).

    0.1579 * %difficult-words + 0.0496 * (words/sentences). Words longer
    than 7 characters stand in for "difficult" words instead of the
    official Dale word list. Returns 0 when the text contains no words
    or no sentences.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Guard against division by zero on empty/degenerate input.
    if not sentences or not words:
        return 0

    # Percentage of "long" words (> 7 characters).
    pct_long = len([w for w in words if len(w) > 7]) / len(words) * 100

    return 0.1579 * pct_long + 0.0496 * (len(words) / len(sentences))
122118
123119
def ari(text):
    """Automated Readability Index.

    4.71 * (chars/words) + 0.5 * (words/sentences) - 21.43.

    NOTE(review): the character count here is len(text), which includes
    whitespace and punctuation — standard ARI counts letters/digits
    only; confirm this is intentional. Returns 0 when the text contains
    no words or no sentences.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_chars = len(text)

    # Guard against division by zero on empty/degenerate input.
    if not words or not sentences:
        return 0

    return (4.71 * (n_chars / len(words))
            + 0.5 * (len(words) / len(sentences))
            - 21.43)
138134
139135
def smog(text):
    """SMOG Index.

    1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291, where a
    polysyllable is a word whose sylco() estimate exceeds 2 syllables.
    Returns 0 when the text contains no sentences.
    """
    word_tokens = regexp_tokenize(text, r'\w+')
    sents = sent_tokenize(text)
    length_s = len(sents)

    # Guard against division by zero when there are no sentences.
    if length_s == 0:
        return 0

    # Total words with more than 2 (estimated) syllables.
    tps = sum(1 for word in word_tokens if sylco(word) > 2)

    # snake_case local, consistent with the rename applied to every
    # other function in this module (length_w, num_vowels, gfi, ...).
    smog_index = 1.043 * math.sqrt(tps * (30 / length_s)) + 3.1291
    return smog_index
156152
157153
@@ -209,7 +205,7 @@ def transform(self, new_documents):
209205
210206 """
211207
212- if not type (new_documents ) == list :
208+ if type (new_documents ) is not list :
213209 new_documents .values .tolist ()
214210
215211 if self .verbose :
0 commit comments