Skip to content

Commit 3375804

Browse files
authored
Merge pull request #9 from SkBlaz/aa_ftrs
Feature: Reading Comprehension features
2 parents 55d0716 + 77380c7 commit 3375804

File tree

2 files changed

+264
-5
lines changed

2 files changed

+264
-5
lines changed
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
import math
2+
import re
3+
import pandas as pd
4+
import numpy as np
5+
import tqdm
6+
import nltk
7+
from nltk import sent_tokenize, regexp_tokenize
8+
9+
import logging
10+
11+
# Configure global logging: timestamped messages, INFO level and above.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')
logging.getLogger().setLevel(logging.INFO)
14+
15+
def sylco(word):
    """Estimate the syllable count of an English word with vowel heuristics.

    :param word: The word whose syllables are counted (case-insensitive).
    :return: Estimated number of syllables (an int, at least 1 for short words).
    """
    word = word.lower()

    bonus = 0      # syllables added by prefix rules
    discarded = 0  # syllables discarded for consecutive-vowel clusters

    # Rule 1: words of three letters or fewer count as one syllable.
    if len(word) <= 3:
        return 1

    # Rule 4: consecutive vowel pairs and triplets collapse into a single
    # syllable, so subtract one per detected cluster.
    vowel_pairs = len(re.findall(r'[eaoui][eaoui]', word))
    vowel_triplets = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
    discarded += vowel_pairs + vowel_triplets

    # Rule 5: every vowel is a syllable candidate.
    total_vowels = len(re.findall(r'[eaoui]', word))

    # Rule 9: "tri-" or "bi-" followed by a vowel adds one syllable
    # (e.g. "tri-age", "bi-ology").
    if word.startswith("tri") and len(word) > 3 and word[3] in "aeoui":
        bonus += 1
    if word.startswith("bi") and len(word) > 2 and word[2] in "aeoui":
        bonus += 1

    # Rules 10/13 ("-ian" endings, "-n't" contractions) are intentionally
    # not implemented in this trimmed-down heuristic.
    return total_vowels - discarded + bonus
49+
50+
51+
def gfi(text):
    """Gunning Fog Index: 0.4 * (avg sentence length + % of long words).

    :param text: Raw input text.
    :return: The GFI score, or 0 for empty input.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sentences)

    # Guard: empty text would otherwise divide by zero.
    if not n_words or not n_sents:
        return 0

    # Words longer than 7 characters are treated as "complex".
    pct_long = sum(1 for w in words if len(w) > 7) / n_words * 100
    return 0.4 * ((n_words / n_sents) + pct_long)
65+
66+
67+
def fre(text):
    """Flesch Reading Ease score (higher means easier to read).

    :param text: Raw input text.
    :return: The FRE score, or 0 for empty input.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sentences)

    # Guard: empty text would otherwise divide by zero.
    if not n_words or not n_sents:
        return 0

    total_syllables = sum(sylco(w) for w in words)
    return (206.835
            - 1.015 * (n_words / n_sents)
            - 84.6 * (total_syllables / n_words))
83+
84+
85+
def fkgl(text):
    """Flesch-Kincaid Grade Level (approximate US school grade).

    :param text: Raw input text.
    :return: The FKGL score, or 0 for empty input.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sentences)

    # Guard: empty text would otherwise divide by zero.
    if not n_words or not n_sents:
        return 0

    total_syllables = sum(sylco(w) for w in words)
    return (0.39 * (n_words / n_sents)
            + 11.8 * (total_syllables / n_words)
            - 15.59)
101+
102+
103+
def dcrf(text):
    """Dale-Chall Readability Formula (long-word approximation).

    Uses words longer than 7 characters as a proxy for "difficult" words
    instead of the original Dale-Chall word list.

    :param text: Raw input text.
    :return: The DCRF score, or 0 for empty input.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sentences)

    # Guard: empty text would otherwise divide by zero.
    if not n_words or not n_sents:
        return 0

    pct_long = sum(1 for w in words if len(w) > 7) / n_words * 100
    return 0.1579 * pct_long + 0.0496 * (n_words / n_sents)
118+
119+
120+
def ari(text):
    """Automated Readability Index from characters/word and words/sentence.

    :param text: Raw input text.
    :return: The ARI score, or 0 for empty input.
    """
    words = regexp_tokenize(text, r'\w+')
    sentences = sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sentences)
    n_chars = len(text)  # character count includes whitespace/punctuation

    # Guard: empty text would otherwise divide by zero.
    if not n_words or not n_sents:
        return 0

    return (4.71 * (n_chars / n_words)
            + 0.5 * (n_words / n_sents)
            - 21.43)
134+
135+
136+
def smog(text):
    """SMOG index: grade estimate from polysyllabic-word density.

    :param text: Raw input text.
    :return: The SMOG score, or 0 when no sentences are found.
    """
    words = regexp_tokenize(text, r'\w+')
    n_sents = len(sent_tokenize(text))

    # Guard: no sentences would otherwise divide by zero.
    if not n_sents:
        return 0

    # Count words with more than two syllables (polysyllables).
    polysyllables = sum(1 for w in words if sylco(w) > 2)
    return 1.043 * math.sqrt(polysyllables * (30 / n_sents)) + 3.1291
152+
153+
154+
def sent_len(text):
    """Average sentence length of the text, measured in word tokens.

    :param text: Raw input text.
    :return: Mean number of word tokens per sentence, or 0 for empty input.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0
    lengths = [len(nltk.word_tokenize(sentence)) for sentence in sentences]
    return sum(lengths) / len(lengths)
163+
164+
165+
def ttr(text):
    """Type-token ratio: distinct tokens / total tokens (vocabulary diversity).

    :param text: Raw input text.
    :return: Ratio in [0, 1], or 0 for empty input.
    """
    tokens = [token
              for sentence in nltk.sent_tokenize(text)
              for token in nltk.word_tokenize(sentence)]
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)
171+
172+
173+
class ComperhensionFeatures:
    """Readability ("comprehension") feature extractor with an sklearn-like API.

    Transforms a collection of documents into an (n_documents, 8) matrix of
    readability scores: gfi, fre, fkgl, dcrf, ari, smog, sent_len and ttr.
    """

    def __init__(self, verbose=True):
        """
        Class initialization method.

        :param verbose: Whether to have the printouts

        """
        self.verbose = verbose
        # Feature name -> scoring function; dict iteration order defines the
        # column order of the output matrix.
        self.features = {"gfi": gfi,
                         "fre": fre, "fkgl": fkgl,
                         "dcrf": dcrf, "ari": ari,
                         "smog": smog, "sent_len": sent_len,
                         "ttr": ttr}
        self.ndim = len(self.features)

    def fit(self, text_list):
        """
        The fit method (no-op; present for sklearn pipeline compatibility).

        :param text_list: List of input texts

        """
        pass

    def transform(self, new_documents):
        """
        Transform method.

        :param new_documents: The new set of documents to be transformed
            (a list of strings, or a pandas Series of strings).
        :return all_embeddings: The final embedding matrix of shape
            (len(new_documents), self.ndim)

        """
        # BUGFIX: the converted list was previously discarded, leaving a
        # pandas Series in place for the enumeration below.
        if type(new_documents) is not list:
            new_documents = new_documents.values.tolist()

        if self.verbose:
            logging.info("[Comperhension Features] Transforming new documents.")

        new_features = np.zeros((len(new_documents), self.ndim))
        for enx, doc in tqdm.tqdm(enumerate(new_documents),
                                  total=len(new_documents)):
            for mid, method in enumerate(self.features):
                # BUGFIX: write each score to cell (document row, feature
                # column); the original wrote whole row `mid`, so every
                # document overwrote the first `ndim` rows of the matrix.
                new_features[enx, mid] = self.features[method](doc)

        return new_features

    def fit_transform(self, documents, b=None):
        """
        The sklearn-like fit-transform method.

        :param documents: The documents to fit and transform.
        :param b: Ignored (kept for sklearn signature compatibility).
        """

        self.fit(documents)
        return self.transform(documents)

    def get_feature_names_out(self):
        """
        Get feature names (column names of the transform output, in order).
        """

        return list(self.features.keys())
238+
239+
240+
if __name__ == "__main__":

    # Smoke test: load the insults dataset and extract readability features.
    example_text = pd.read_csv("../../data/insults/train.tsv", sep="\t")['text_a']
    labels = pd.read_csv("../../data/insults/train.tsv",
                         sep="\t")['label'].values.tolist()
    clx = ComperhensionFeatures()
    sim_features = clx.fit_transform(example_text)

    print(clx.get_feature_names_out())

autoBOTLib/optimization/optimization_feature_constructors.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from autoBOTLib.features.features_token_relations import *
2020
from autoBOTLib.features.features_contextual import *
2121
from autoBOTLib.features.features_images import *
22+
from autoBOTLib.features.features_reading_comperhension import *
2223

2324
import string
2425
import re
@@ -61,22 +62,22 @@ def PerceptronTagger():
6162
'concept_features', 'document_graph', 'relational_features_token',
6263
'topic_features', 'keyword_features', 'relational_features_char',
6364
'char_features', 'word_features', 'relational_features_bigram',
64-
'contextual_features'
65+
'contextual_features', 'reading_features'
6566
]
6667

6768
# This one is ~language agnostic
6869
feature_presets['neurosymbolic-lite'] = [
6970
'document_graph', 'neural_features_dbow', 'neural_features_dm',
7071
'topic_features', 'keyword_features', 'relational_features_char',
7172
'relational_features_token', 'char_features', 'word_features',
72-
'relational_features_bigram', 'concept_features'
73+
'relational_features_bigram', 'concept_features', 'reading_features'
7374
]
7475

7576
# MLJ paper versions
7677
feature_presets['neurosymbolic-default'] = [
7778
'neural_features_dbow', 'neural_features_dm', 'keyword_features',
7879
'relational_features_char', 'char_features', 'word_features',
79-
"pos_features", 'concept_features'
80+
"pos_features", 'concept_features', 'reading_features'
8081
]
8182

8283
feature_presets['neural'] = [
@@ -86,7 +87,7 @@ def PerceptronTagger():
8687
feature_presets['symbolic'] = [
8788
'concept_features', 'relational_features_token', 'topic_features',
8889
'keyword_features', 'relational_features_char', 'char_features',
89-
'word_features', 'pos_features', 'relational_features_bigram'
90+
'word_features', 'pos_features', 'relational_features_bigram', 'reading_features'
9091
]
9192

9293
if not contextual_feature_library:
@@ -464,6 +465,8 @@ def get_features(df_data,
464465
keyword_features = KeywordFeatures(max_features=max_num_feat,
465466
targets=targets)
466467

468+
reading_features = ComperhensionFeatures()
469+
467470
topic_features = TopicDocs(ndim=embedding_dim)
468471

469472
concept_features_transformer = ConceptFeatures(
@@ -570,7 +573,15 @@ def get_features(df_data,
570573
contextual_features),
571574
('normalize',
572575
Normalizer(norm=normalization_norm))
573-
]))
576+
])),
577+
"reading_features": ("reading_features", pipeline.Pipeline([
578+
('s7', text_col(key='text')),
579+
('reading_features',
580+
reading_features),
581+
('normalize',
582+
Normalizer(norm=normalization_norm))
583+
]
584+
))
574585
}
575586

576587
if include_image_transformer:

0 commit comments

Comments
 (0)