-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathauxiliary_extraction_version2.py
More file actions
302 lines (239 loc) · 10.7 KB
/
auxiliary_extraction_version2.py
File metadata and controls
302 lines (239 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# author: Xiaomeng (Miranda) Zhu
# version 2 updated on Aug 19
# purpose: extracting morphosyntactic features of interest from txt file
import os
import pandas as pd # library for creating dataframes and storing into csv
import re
import spacy
# os.chdir('/Users/mirandazhu/Desktop/demo') # CHANGE ME: the folder where you store the txt file of the speaker
# Global configuration (edit the CHANGE ME values before running):
speakerID = "DCB_se1_ag2_m_02_3" # CHANGE ME: name of the txt file that you downloaded (without the ".txt" extension); also used as the prefix of the output csv
speaker = "DCB_se1_ag2_m_02" # CHANGE ME: the speaker ID in the transcription; only rows whose second tab-separated field equals this value are analysed
file_name = speakerID + ".txt" # path of the transcript that main() reads
# set up language model for part of speech tagging
# (loaded once at import time; the extraction functions use it for tokens, POS tags and morphology)
nlp = spacy.load("en_core_web_sm")
# ========== Part 1: preprocessing ==========
def read_file(file_name):
    """Return all lines of the text file *file_name* as a list of strings.

    Each line keeps its trailing newline. The file is opened in text mode with
    the platform's default encoding.
    """
    with open(file_name) as handle:
        return handle.readlines()
def filter_speakers(lines):
    """
    Keep only the transcript rows spoken by the module-level ``speaker``.

    Each line is split on tabs and the second field is compared with ``speaker``.
    Returns a list of lists of the form [Line, Spkr, StTime, Content, EnTime].

    The line terminator is removed before splitting so the last field does not
    carry a trailing newline into the output, and lines with fewer than two
    fields (e.g. blank lines) are skipped instead of raising IndexError.
    """
    filtered = []
    for line in lines:
        # split once per line and reuse the result for both the test and the output
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) > 1 and fields[1] == speaker:
            filtered.append(fields)
    return filtered
def get_all_sentences(lines):
    """Return, in order, the element at index 3 (the sentence content) of every row in *lines*."""
    sents = []
    for row in lines:
        sents.append(row[3])
    return sents
# ========= Part 2: auxiliary extraction =========
def find_all_not_fully_contracted_N(sent):
    """
    Count negations in *sent* that use a free-standing "not" token.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp`` and
    every token whose lower-cased text is "not" is classified by the token
    immediately before it:

      * is / are / am          -> BE_N["<aux> not"]
      * have / has             -> HV_N["<aux> not"]
      * do / does / did        -> DO_N["<aux> not"]
      * ...'re / ...'m         -> BE_N["'re not"] / BE_N["'m not"]
      * ...'ve                 -> HV_N["'ve not"]
      * ...'s                  -> HV_N["'s not"] if the token after "not" is a
                                  past-tense VERB, otherwise BE_N["'s not"]
      * anything else, or "not" is the first token -> BE_N["not"]

    Returns the tuple (BE_N, HV_N, DO_N) of form -> count dicts.
    """
    BE_N = {"is not": 0, "are not": 0, "am not": 0, "'s not": 0, "'re not": 0, "'m not": 0, "not": 0}
    HV_N = {"have not": 0, "has not": 0, "'ve not": 0, "'s not": 0}
    DO_N = {"do not": 0, "does not": 0, "did not": 0}

    # Cheap pre-check so spaCy only runs when the sentence has a standalone "not":
    # "not" preceded by a non-alphanumeric character or the sentence start, and
    # followed by a non-alphanumeric character or the sentence end.
    # Case-insensitive so it agrees with the lower-cased token test below
    # (otherwise a sentence whose only "not" is capitalised is never examined).
    if not re.search(r"(?:\W|^)not(?:\W|$)", sent, re.IGNORECASE):
        return BE_N, HV_N, DO_N

    doc = nlp(sent)
    last_idx = len(doc) - 1
    for idx, token in enumerate(doc):
        if token.text.lower() != "not":
            continue
        if idx == 0:
            # a sentence-initial "not" has no preceding auxiliary
            BE_N["not"] += 1
            continue

        prev_token = doc[idx - 1].text.lower()
        if prev_token in ("is", "are", "am"):
            BE_N[prev_token + " not"] += 1
        elif prev_token in ("have", "has"):
            HV_N[prev_token + " not"] += 1
        elif prev_token in ("do", "does", "did"):
            DO_N[prev_token + " not"] += 1
        elif prev_token.endswith("'re"):
            BE_N["'re not"] += 1
        elif prev_token.endswith("'m"):
            BE_N["'m not"] += 1
        elif prev_token.endswith("'ve"):
            HV_N["'ve not"] += 1
        elif prev_token.endswith("'s"):
            # 's could be a contracted is or has, so check the token after "not"
            is_has = False
            if idx != last_idx:
                next_token = doc[idx + 1]
                tense = next_token.morph.get("Tense")
                is_has = next_token.pos_ == "VERB" and bool(tense) and tense[0] == "Past"
            if is_has:
                HV_N["'s not"] += 1
            else:
                # also covers a sentence-final "'s not", which would otherwise
                # not be counted in any category
                BE_N["'s not"] += 1
        else:
            # "not" is not preceded by any of the auxiliaries above
            BE_N["not"] += 1
    return BE_N, HV_N, DO_N
def find_contracted_s(sent):
    """
    Count positive (non-negated) contracted 's tokens in *sent*.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``. For
    each token ending in 's:

      * followed by a "not" token      -> skipped (it is a negative construction,
                                          not a positive one)
      * followed by a past-tense VERB  -> HV_P["'s"]  (contracted "has")
      * otherwise, or sentence-final   -> BE_P["'s"]  (contracted "is")

    CAUTION: possessive 's is not distinguished and is counted as BE_P.

    Returns the tuple (BE_P, HV_P); each dict only has the key "'s".
    """
    BE_P = {"'s": 0}
    HV_P = {"'s": 0}

    # Cheap pre-check: run spaCy only if some 's is not directly followed by the
    # word "not". `\b` also admits a sentence-final 's, and `not\b` keeps words
    # that merely start with "not" (nothing, noticed) from being treated as "not".
    if not re.search(r"'s\b(?!\Wnot\b)", sent, re.IGNORECASE):
        return BE_P, HV_P

    doc = nlp(sent)
    last_idx = len(doc) - 1
    for idx, token in enumerate(doc):
        if not re.search(r"'s(?:\W|$)", token.text, re.IGNORECASE):
            continue
        if idx == last_idx:
            # if the token containing 's is the last word of the sentence, categorize it as BE_P
            BE_P["'s"] += 1
            continue

        next_token = doc[idx + 1]
        if next_token.text.lower() == "not":
            # "'s not" is a negative form; counting it here would double-count it
            continue
        tense = next_token.morph.get("Tense")
        if next_token.pos_ == "VERB" and tense and tense[0] == "Past":
            HV_P["'s"] += 1  # 's is a contracted has
        else:
            BE_P["'s"] += 1  # 's is a contracted is
    return BE_P, HV_P
def find_BE_P(sent):
    """
    Count positive forms of BE in *sent*: is, are, am, 're, 'm ("'s" is not counted here).

    Matching is case-insensitive. An occurrence is skipped when it is followed
    by exactly one non-word character and then the whole word "not".

    Returns a dict mapping each form to its number of occurrences.
    """
    # `\b` is used instead of consuming the neighbouring characters so that
    # adjacent repetitions ("is is") are each counted, and `not\b` ensures only
    # the word "not" suppresses a match, never words such as "nothing".
    patterns = {
        "is": r"\bis\b(?!\Wnot\b)",
        "are": r"\bare\b(?!\Wnot\b)",
        "am": r"\bam\b(?!\Wnot\b)",
        "'re": r"'re\b(?!\Wnot\b)",
        "'m": r"'m\b(?!\Wnot\b)",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_BE_fullycontracted_N(sent):
    """
    Count the fully contracted negative forms of BE in *sent*: isn't, aren't.

    Matching is case-insensitive and whole-word. Word boundaries are used rather
    than consuming the surrounding characters, so adjacent repetitions such as
    "isn't isn't" are each counted.

    Returns a dict mapping each form to its number of occurrences.
    """
    patterns = {
        "isn't": r"\bisn't\b",
        "aren't": r"\baren't\b",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_HV_P(sent):
    """
    Count positive forms of HAVE in *sent*: have, has, 've.

    Matching is case-insensitive. An occurrence is skipped when it is followed
    by exactly one non-word character and then the whole word "not".
    Possessive "have" is not distinguished from auxiliary "have".

    Returns a dict mapping each form to its number of occurrences.
    """
    # `\b` keeps adjacent repetitions ("have have") countable, and `not\b`
    # ensures words that merely start with "not" ("I've noticed") do not
    # suppress the match.
    patterns = {
        "have": r"\bhave\b(?!\Wnot\b)",
        "has": r"\bhas\b(?!\Wnot\b)",
        "'ve": r"'ve\b(?!\Wnot\b)",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_HV_fullycontracted_N(sent):
    """
    Count the fully contracted negative forms of HAVE in *sent*: haven't, hasn't.

    Matching is case-insensitive and whole-word. Word boundaries are used rather
    than consuming the surrounding characters, so adjacent repetitions such as
    "haven't haven't" are each counted.

    Returns a dict mapping each form to its number of occurrences.
    """
    patterns = {
        "haven't": r"\bhaven't\b",
        "hasn't": r"\bhasn't\b",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_DO_P(sent):
    """
    Count positive forms of DO in *sent*: do, does, did.

    Matching is case-insensitive. An occurrence is skipped when it is followed
    by exactly one non-word character and then the whole word "not".

    Returns a dict mapping each form to its number of occurrences.
    """
    # `\b` keeps adjacent repetitions ("do do") countable, and `not\b` ensures
    # words that merely start with "not" ("did notice") do not suppress the match.
    patterns = {
        "do": r"\bdo\b(?!\Wnot\b)",
        "does": r"\bdoes\b(?!\Wnot\b)",
        "did": r"\bdid\b(?!\Wnot\b)",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_DO_fullycontracted_N(sent):
    """
    Count the fully contracted negative forms of DO in *sent*: don't, doesn't, didn't.

    Matching is case-insensitive and whole-word. Word boundaries are used rather
    than consuming the surrounding characters, so adjacent repetitions such as
    "don't don't" are each counted.

    Returns a dict mapping each form to its number of occurrences.
    """
    patterns = {
        "don't": r"\bdon't\b",
        "doesn't": r"\bdoesn't\b",
        "didn't": r"\bdidn't\b",
    }
    return {
        form: len(re.findall(pattern, sent, re.IGNORECASE))
        for form, pattern in patterns.items()
    }
def find_AI(sent):
    """
    Count occurrences of "ain't" in *sent* (case-insensitive, whole word).

    Word boundaries are used rather than consuming the surrounding characters,
    so adjacent repetitions such as "ain't ain't" are each counted.

    Returns a dict with the single key "ain't".
    """
    return {"ain't": len(re.findall(r"\bain't\b", sent, re.IGNORECASE))}
def collect_all_aux(sent):
    """
    Run every auxiliary extractor on *sent* and merge the results per category.

    Returns the tuple (BE_N, HV_N, DO_N, BE_P, HV_P, DO_P, AI), where each item
    is a dict mapping an auxiliary form to its count in the sentence.
    """
    # negative constructions: start from the "<aux> not" counts, then add the
    # fully contracted ("n't") forms to the same dicts
    be_neg, hv_neg, do_neg = find_all_not_fully_contracted_N(sent)
    be_neg.update(find_BE_fullycontracted_N(sent))
    hv_neg.update(find_HV_fullycontracted_N(sent))
    do_neg.update(find_DO_fullycontracted_N(sent))

    # positive constructions: start from the contracted 's counts, then add the
    # remaining BE / HAVE forms
    be_pos, hv_pos = find_contracted_s(sent)
    be_pos.update(find_BE_P(sent))
    hv_pos.update(find_HV_P(sent))

    return be_neg, hv_neg, do_neg, be_pos, hv_pos, find_DO_P(sent), find_AI(sent)
def main():
    """
    Extract every auxiliary used by the configured speaker and write them to a csv.

    Reads ``file_name``, keeps the rows of ``speaker``, runs ``collect_all_aux``
    on each sentence, and writes one output row per auxiliary occurrence to
    ``speakerID + "_new.csv"``. Each output row is the original transcript row
    followed by the category label and the auxiliary form.
    """
    columns = ['Line', 'Spkr', 'StTime', 'Content', 'EnTime', 'Label', 'Aux']
    labels = ("BE_N", "HV_N", "DO_N", "BE_P", "HV_P", "DO_P", "AI")

    filtered = filter_speakers(read_file(file_name))
    all_sents = get_all_sentences(filtered)

    res = []
    # for each sentence with auxiliary, append category and token
    for fields, sent in zip(filtered, all_sents):
        master_dict = dict(zip(labels, collect_all_aux(sent)))
        for category_name, category_dict in master_dict.items():
            for token, count in category_dict.items():
                # one row per occurrence; a count of 0 adds nothing
                res.extend(fields + [category_name, token] for _ in range(count))

    # Passing the column names to the constructor (instead of assigning
    # res_df.columns afterwards) also works when no auxiliary was found;
    # assigning 7 names to an empty frame raises ValueError.
    res_df = pd.DataFrame(res, columns=columns)
    res_df.to_csv(speakerID+"_new.csv")
if __name__ == "__main__":
    # Script entry point: run the full extraction for the configured speaker.
    # The commented-out lines below are scratch snippets from manual debugging
    # of the spaCy tagging; uncomment them to experiment.
    # doc = nlp("that's why I said that it was kind of sad that it's changing because")
    # sent = "Not everyone has not been is not"
    # print(find_all_not_fully_contracted_N(sent))
    # print(collect_all_aux(sent))
    # token = doc[-2] # 'I'
    # # print(token.text)
    # # print(token.morph) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
    # print(token.morph.get("Tense")) # ['Prs']
    main()