-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstats.py
More file actions
53 lines (49 loc) · 2.08 KB
/
stats.py
File metadata and controls
53 lines (49 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# remaking the GAM from 'Modeling Language Change in English First Names'
import pickle, os
import pandas
from name_processing import annotate_name
from pygam import LogisticGAM, s, f, te
# Directory that holds the per-file name counts (one "name,sex,count" record
# per line, as parsed below).
path_to_data = "./data/"
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the training frame is reproducible from run to run.
listed_files = sorted(os.listdir(path_to_data))
column_names = [
    "spelling",
    "sex",
    "count",
    "decade",
    "stress",
    "syll_count",
    "ends_in_vowel",
    "initial_vowel",
]
# Rows are gathered in a plain list and the frame is built once at the end:
# enlarging a DataFrame with one ``.loc`` assignment per row re-allocates the
# frame on every record.
rows = []
for file_name in listed_files:
    # Only files whose character at index 6 is "0" are used; characters 3-5
    # plus a literal "0" are read as the decade (presumably the file name
    # embeds a four-digit year at positions 3-6 -- verify against ./data/).
    # The check happens before opening so skipped entries are never touched,
    # and names shorter than 7 characters are skipped instead of raising
    # IndexError.
    if len(file_name) < 7 or file_name[6] != "0":
        continue
    decade = int(file_name[3:6] + "0")
    with open(os.path.join(path_to_data, file_name)) as fi:
        for raw_line in fi:
            record = raw_line.rstrip("\r\n")
            if not record:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            fields = record.split(",")
            annotations = annotate_name(fields[0])
            # annotate_name signals a name it cannot annotate with this
            # sentinel string; such names are left out of the training data.
            if annotations == "NAME ERROR":
                continue
            rows.append({
                "spelling": fields[0],
                # Binary response: 1 for "F", 0 for anything else.
                "sex": 1 if fields[1] == "F" else 0,
                # The line ending was already removed above, so the count is
                # parsed whole; slicing off the last character would drop a
                # digit on a final line that has no trailing newline.
                "count": int(fields[2]),
                "decade": decade,
                "stress": annotations["stress"],
                "syll_count": annotations["syll_count"],
                "ends_in_vowel": annotations["ends_in_vowel"],
                "initial_vowel": annotations["initial_vowel"],
            })
# Built in one step; column dtypes are inferred from the collected values.
data_frame = pandas.DataFrame(rows, columns=column_names)
'''using the dependent variable of sex,
and the independent variables of stress (reference level = Primary),
final phoneme (reference = C), initial
vowel (reference = [æ]),
and number of syllables (treated categorically; reference = 1),
plus decade as a non-parametric smooth term, interacted with syllable count'''
# Term indices refer to the column positions of X below:
#   0 = decade, 1 = stress, 2 = syll_count, 3 = ends_in_vowel, 4 = initial_vowel
# s(0) is a spline term on decade, f(1)..f(4) are factor terms, and te(0, 2)
# is a tensor-product interaction between decade and syll_count.
# NOTE(review): te(0, 2) is built with pygam's default settings for both
# margins, while the description above treats syllable count categorically --
# confirm this is the intended interaction.
terms = s(0) + f(1) + f(2) + f(3) + f(4) + te(0, 2)
X = data_frame[["decade", "stress", "syll_count", "ends_in_vowel", "initial_vowel"]]
# Binary response column (1/0 coding of sex).
Y = data_frame["sex"]
gam = LogisticGAM(terms=terms).fit(X, Y)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
gam.summary()
# Persist the fitted model and the frame it was trained on so they can be
# reloaded without re-reading the raw data and refitting.
# The handles are deliberately not named ``f``: that would rebind the
# module-level ``f`` (the factor-term constructor imported from pygam).
with open('gam.pkl', 'wb') as model_file:
    pickle.dump(gam, model_file)
with open('training_data.pkl', 'wb') as data_file:
    pickle.dump(data_frame, data_file)