-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdropmissingipa.py
More file actions
120 lines (67 loc) · 2.93 KB
/
dropmissingipa.py
File metadata and controls
120 lines (67 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""read raw files, ignore files with no English borrowings and missing IPA-transcriptions"""
import os
from functools import cache
from espeakng import ESpeakNG
import pandas as pd
from tqdm import tqdm
from loanpy import helpers
# Parent directory of the current working directory; the raw1/ and raw2/
# input folders are looked up inside it by the processing loop below.
REPO = os.path.dirname(os.getcwd())
# loanpy helper object; en2ipa() reads its `phon2cv` mapping to check whether
# a segment is classified as "V" (presumably "vowel" - confirm in loanpy).
hp = helpers.Help()
def en2ipa(df):
    """Add an ``en_ipa`` column holding espeak-ng transcriptions of ``L2_etym``.

    Every string in ``df["L2_etym"]`` is transcribed with the ``en-us`` voice
    of espeak-ng (hyphens are removed first).  Tie bars (U+0361) that touch a
    segment classified as ``"V"`` in ``hp.phon2cv`` are stripped from the
    transcription; all other tie bars are kept.

    Args:
        df: DataFrame with an ``L2_etym`` column.  It is modified in place.

    Returns:
        The same DataFrame, now with an ``en_ipa`` column.  Cells are ``None``
        where ``L2_etym`` is not a string (e.g. NaN) or where espeak-ng's
        output could not be decoded.
    """
    tiebar = chr(865)  # U+0361 COMBINING DOUBLE INVERTED BREVE

    def is_vowel(segment):
        return hp.phon2cv.get(segment, "") == "V"

    def no_tiebar_4vow(func, word, ipa):
        """Transcribe *word* with *func* and drop tie bars next to a vowel."""
        ipaword = func(word.replace("-", ""), ipa)
        last = len(ipaword) - 1
        kept = []
        for nr, char in enumerate(ipaword):
            if char == tiebar:
                # Only look at neighbours that really exist: index -1 would
                # wrap around to the end of the string, and index last+1
                # would raise IndexError.
                vowel_before = nr > 0 and is_vowel(ipaword[nr - 1])
                vowel_after = nr < last and is_vowel(ipaword[nr + 1])
                if vowel_before or vowel_after:
                    continue
            kept.append(char)
        return "".join(kept)

    esng = ESpeakNG()
    esng.voice = "en-us"

    # Identical etyma are transcribed only once per call; each transcription
    # goes through the espeak-ng wrapper, so repeats are worth caching.
    @cache
    def g(word, ipa):
        return no_tiebar_4vow(esng.g2p, word, ipa)

    ipalist = []
    for word in tqdm(df["L2_etym"]):
        # Missing values arrive as float NaN (or None); only real strings
        # can be transcribed.
        if not isinstance(word, str):
            ipalist.append(None)
            continue
        try:
            ipalist.append(g(word, ipa=2))
        except UnicodeDecodeError:
            # Report the offending word and leave its cell empty instead of
            # aborting the whole file.
            print(word)
            ipalist.append(None)
    df["en_ipa"] = ipalist
    return df
# Process every CSV in raw2/ and raw1/ (in that order) and write the result
# under the same file name into the current working directory.
for folder in ["raw2", "raw1"]:
    for file in os.listdir(os.path.join(REPO, folder)):
        print(file)
        # An output file of this name already exists in the working
        # directory, so this input (or a same-named one from an earlier
        # folder) was handled before.
        if os.path.exists(file):
            continue

        # os.path.join keeps the path portable; the previous hand-written
        # backslash separators only worked on Windows.
        df = pd.read_csv(os.path.join(REPO, folder, file))

        # Skip files without a single etymon.
        if df["L2_etym"].isna().all():
            continue
        # Rows without an IPA transcription are unusable; re-check afterwards
        # whether any etymon survived the filtering.
        df = df.dropna(subset=["L2_ipa"])
        if df["L2_etym"].isna().all():
            continue

        df = en2ipa(df)
        print(f"writing {file}")
        df.to_csv(file, encoding="utf-8", index=False)

        # Re-read the file just written to detect transcriptions that were
        # lost: after the CSV round trip, empty cells come back as float NaN.
        df = pd.read_csv(file).dropna(subset=["L2_orth", "L2_ipa"])
        # Keep a row unless it has an etymon (non-float) but no English IPA
        # (float NaN).
        keep = pd.Series(
            [
                isinstance(etym, float) or not isinstance(ipa, float)
                for etym, ipa in zip(df["L2_etym"], df["en_ipa"])
            ],
            index=df.index,
            dtype=bool,
        )
        df = df[keep]

        if len(df) > 0:
            df.to_csv(file, encoding="utf-8", index=False)
        else:
            # Nothing usable is left; do not keep an empty output file.
            os.remove(file)