|
| 1 | +import math |
| 2 | +import re |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +import tqdm |
| 6 | +import nltk |
| 7 | +from nltk import sent_tokenize, regexp_tokenize |
| 8 | + |
| 9 | +import logging |
| 10 | + |
# Root-logger setup: timestamp every message and surface INFO and above.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')
logging.getLogger().setLevel(logging.INFO)
| 14 | + |
def sylco(word):
    """Heuristically count the syllables in *word*.

    Vowels (a, e, i, o, u) are counted, with consecutive vowel pairs and
    triples collapsed into a single syllable; a "tri-" or "bi-" prefix
    followed by a vowel contributes one extra syllable.

    :param word: The word to analyse (any case).
    :return: Estimated syllable count, always at least 1.
    """
    word = word.lower()

    syls = 0  # added syllable number
    disc = 0  # discarded syllable number

    # 1) very short words always count as a single syllable
    if len(word) <= 3:
        return 1

    # 4) consecutive vowels (pairs/triples) are pronounced as one syllable,
    #    so every extra vowel in a run is discarded (pairs overlap-free;
    #    a triple matches once as a pair and once as a triple => disc of 2)
    double_and_triple = len(re.findall(r'[eaoui][eaoui]', word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
    disc += double_and_triple + tripple

    # 5) count all vowels in the word
    num_vowels = len(re.findall(r'[eaoui]', word))

    # 9) "tri-" / "bi-" prefix followed by a vowel adds one syllable
    if word[:3] == "tri" and len(word) > 3 and word[3] in "aeoui":
        syls += 1

    if word[:2] == "bi" and len(word) > 2 and word[2] in "aeoui":
        syls += 1

    # 10/13) "-ian", "-n't" etc. rules intentionally omitted in this port.

    # BUG FIX: words with no matched vowels (e.g. "rhythm") previously
    # returned 0, which skewed the per-word syllable averages used by the
    # readability formulas; every word has at least one syllable.
    return max(1, num_vowels - disc + syls)
| 49 | + |
| 50 | + |
def gfi(text):
    """Gunning Fog Index of *text*.

    NOTE(review): uses word length > 7 characters as a proxy for "complex"
    words (the classic formula counts words of 3+ syllables) — confirm the
    proxy is intended before comparing against other implementations.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Degenerate input: nothing to measure.
    if not words or not sentences:
        return 0

    long_word_count = sum(1 for w in words if len(w) > 7)
    pct_long = long_word_count / len(words) * 100
    return 0.4 * ((len(words) / len(sentences)) + pct_long)
| 65 | + |
| 66 | + |
def fre(text):
    """Flesch Reading Ease of *text* (higher scores = easier to read)."""
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Degenerate input: nothing to measure.
    if not words or not sentences:
        return 0

    syllable_total = sum(sylco(w) for w in words)
    return 206.835 - 1.015 * (len(words) / len(sentences)) \
        - 84.6 * (syllable_total / len(words))
| 83 | + |
| 84 | + |
def fkgl(text):
    """Flesch–Kincaid Grade Level of *text* (approximate US school grade)."""
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Degenerate input: nothing to measure.
    if not words or not sentences:
        return 0

    syllable_total = sum(sylco(w) for w in words)
    return 0.39 * (len(words) / len(sentences)) \
        + 11.8 * (syllable_total / len(words)) - 15.59
| 101 | + |
| 102 | + |
def dcrf(text):
    """Dale–Chall-style readability score of *text*.

    NOTE(review): substitutes "words longer than 7 characters" for the
    Dale–Chall familiar-word list — confirm this approximation is intended.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Degenerate input: nothing to measure.
    if not words or not sentences:
        return 0

    pct_long = sum(1 for w in words if len(w) > 7) / len(words) * 100
    return 0.1579 * pct_long + 0.0496 * (len(words) / len(sentences))
| 118 | + |
| 119 | + |
def ari(text):
    """Automated Readability Index of *text*.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    :param text: Input document as a single string.
    :return: ARI score, or 0 for empty/unsentenced input.
    """
    word_tokens = regexp_tokenize(text, r'\w+')
    length_w = len(word_tokens)
    sents = sent_tokenize(text)
    length_s = len(sents)
    # BUG FIX: ARI counts characters inside words only; the original used
    # len(text), which also counted spaces and punctuation and inflated
    # the characters-per-word term.
    length_ch = sum(len(w) for w in word_tokens)

    # Avoid division by zero
    if length_w == 0 or length_s == 0:
        return 0

    return 4.71 * (length_ch / length_w) + 0.5 * (length_w / length_s) - 21.43
| 134 | + |
| 135 | + |
def smog(text):
    """SMOG index of *text*, driven by the polysyllabic (3+ syllable)
    word count normalised to a 30-sentence sample."""
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)

    # Degenerate input: no sentences to sample from.
    if not sentences:
        return 0

    polysyllabic = sum(1 for w in words if sylco(w) > 2)
    return 1.043 * math.sqrt(polysyllabic * (30 / len(sentences))) + 3.1291
| 152 | + |
| 153 | + |
def sent_len(text):
    """Average sentence length of *text*, measured in word tokens."""
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0
    token_counts = [len(nltk.word_tokenize(s)) for s in sentences]
    return sum(token_counts) / len(token_counts)
| 163 | + |
| 164 | + |
def ttr(text):
    """Type-token ratio of *text*: distinct tokens / total tokens.

    NOTE(review): punctuation tokens emitted by word_tokenize are included
    in both counts — confirm that is the intended notion of "token".
    """
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        tokens.extend(nltk.word_tokenize(sentence))
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)
| 171 | + |
| 172 | + |
class ComperhensionFeatures:
    """Sklearn-style transformer mapping each document to a vector of
    readability scores (gfi, fre, fkgl, dcrf, ari, smog, sent_len, ttr)."""

    def __init__(self,
                 verbose=True):
        """
        Class initialization method.

        :param verbose: Whether to have the printouts
        """
        self.verbose = verbose
        # Feature name -> scoring function; each maps a str to a number.
        # Dict order defines the column order of the output matrix.
        self.features = {"gfi": gfi,
                         "fre": fre, "fkgl": fkgl,
                         "dcrf": dcrf, "ari": ari,
                         "smog": smog, "sent_len": sent_len,
                         "ttr": ttr}
        self.ndim = len(self.features)

    def fit(self, text_list):
        """
        The fit method — a no-op, since all features are stateless.

        :param text_list: List of input texts
        """
        pass

    def transform(self, new_documents):
        """
        Transform method.

        :param new_documents: The new set of documents to be transformed
            (list of str, or a pandas Series-like with .values).
        :return new_features: Feature matrix of shape (n_documents, ndim).
        """
        # BUG FIX: the conversion result was previously discarded (the call
        # was never assigned), so non-list inputs were iterated as-is.
        if not isinstance(new_documents, list):
            new_documents = new_documents.values.tolist()

        if self.verbose:
            logging.info("[Comperhension Features] Transforming new documents.")

        new_features = np.zeros((len(new_documents), self.ndim))
        for enx, doc in tqdm.tqdm(enumerate(new_documents),
                                  total=len(new_documents)):
            for mid, method in enumerate(self.features):
                # BUG FIX: write the single (document, feature) cell; the
                # original `new_features[mid] = value` broadcast each score
                # across an entire ROW, so only the last document's values
                # survived and row indices were feature indices.
                new_features[enx, mid] = self.features[method](doc)

        return new_features

    def fit_transform(self, documents, b=None):
        """
        The sklearn-like fit-transform method.
        """
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names_out(self):
        """
        Get feature names, in output-column order.
        """
        return list(self.features.keys())
| 238 | + |
| 239 | + |
if __name__ == "__main__":

    # Smoke test: embed the insults training set and print feature names.
    # NOTE(review): the file is read twice and `labels` is never used here.
    example_text = pd.read_csv("../../data/insults/train.tsv", sep="\t")['text_a']
    labels = pd.read_csv("../../data/insults/train.tsv",
                         sep="\t")['label'].values.tolist()
    clx = ComperhensionFeatures()
    sim_features = clx.fit_transform(example_text)

    print(clx.get_feature_names_out())
0 commit comments