-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathworking_with_corpus.py
More file actions
61 lines (53 loc) · 2.19 KB
/
working_with_corpus.py
File metadata and controls
61 lines (53 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def build_paradigm(noun, paradigm, suffix, form=None):
    """Fill the ROOT, LEMMA and FORM placeholders of an entry template.

    Args:
        noun: Word the entry is built from. When *form* is given, *noun*
            only supplies the FORM value.
        paradigm: Template text containing the literal tokens ``ROOT``,
            ``LEMMA`` and, optionally, ``FORM``.
        suffix: Ending whose *length* is cut from a word to obtain its
            root. The word is not checked to actually end with it. An
            empty string leaves the word whole.
        form: Optional lemma. When truthy, ``LEMMA`` becomes *form*,
            ``ROOT`` becomes *form* without the suffix and ``FORM``
            becomes *noun* without the suffix. When omitted (or empty),
            ``LEMMA`` becomes *noun*, ``ROOT`` becomes *noun* without the
            suffix and any ``FORM`` token is left untouched.

    Returns:
        The template with the placeholders substituted.
    """
    # Local import so the function stays self-contained.
    import re

    def strip_suffix(word):
        # Guard the empty suffix: word[:-0] would yield "" instead of word.
        return word[:-len(suffix)] if suffix else word

    if form:
        values = {
            "ROOT": strip_suffix(form),
            "LEMMA": form,
            "FORM": strip_suffix(noun),
        }
    else:
        values = {
            "ROOT": strip_suffix(noun),
            "LEMMA": noun,
        }

    # Substitute in a single pass so that text inserted for one placeholder
    # is never rescanned for the others (chained str.replace would corrupt
    # a word that happens to contain "LEMMA" or "FORM").
    placeholder = re.compile("|".join(values))
    return placeholder.sub(lambda match: values[match.group(0)], paradigm)
def _entry_template(paradigm_name):
    """Return an entry template that refers to the paradigm *paradigm_name*.

    The LEMMA and ROOT tokens are placeholders that build_paradigm()
    replaces later.
    """
    return ' <e lm="LEMMA"><i>ROOT</i><par n="' + paradigm_name + '"/></e>'


# One template per inflection paradigm (presumably Apertium monodix
# paradigm names; the "__n" / "__vblex" endings look like noun / verb tags
# -- confirm against the dictionary file).
pupulazzioni = _entry_template("pupulaz/zioni__n")
annu = _entry_template("ann/u__n")
casa = _entry_template("cas/a__n")
mancari = _entry_template("manc/ari__vblex")
parrari = _entry_template("parr/ari__vblex")
battiri = _entry_template("batt/iri__vblex")
filusufia = _entry_template("filusuf/ìa__n")
tirritoriu = _entry_template("tirritori/u__n")
# Punctuation trimmed from both ends of every token before it is counted.
_EDGE_PUNCTUATION = ".,\"':;!?=-)("

# Maps each normalized corpus token to the number of times it occurs.
freq_dictionary = dict()
with open("scn.crp.txt", 'r', encoding="utf-8") as corpus:
    for line in corpus:
        # split() with no argument breaks on any whitespace run and never
        # yields empty or whitespace-padded tokens.
        for word in line.split():
            # Keep only what follows the first apostrophe, presumably to
            # drop an elided article or preposition ("l'annu" -> "annu").
            if "'" in word:
                word = word.split("'")[1]
            elif "’" in word:
                word = word.split("’")[1]
            # Trim all edge punctuation in one call: stripping one symbol
            # at a time is order-dependent and leaves mixed runs such as
            # '("word").' partly untrimmed.
            word = word.strip(_EDGE_PUNCTUATION)
            # Lowercase only after trimming, so that "Casa," and "casa"
            # are counted as the same word.
            if word.isalpha():
                word = word.lower()
            # Tokens that were nothing but punctuation carry no word.
            if not word:
                continue
            freq_dictionary[word] = freq_dictionary.get(word, 0) + 1
# Entries proposed for corpus words that end in "iu", occur more than five
# times and are not yet listed as a lemma in the dictionary file.
entries = []
with open("scn_dix.txt", 'r', encoding="utf-8") as dix_file:
    # The whole dictionary is kept as one string; `dictionary` stays bound
    # to that text after the block, as before.
    dictionary = dix_file.read()

for key in freq_dictionary:
    # Cheap tests first; the substring scan over the dictionary runs last.
    if not key.endswith("iu") or freq_dictionary[key] <= 5:
        continue
    # The closing quote makes this an exact lemma match. Without it, an
    # existing lemma that merely starts with `key` would hide the word.
    if 'lm="' + key + '"' in dictionary:
        continue
    entries.append(build_paradigm(key, tirritoriu, "u"))

for e in sorted(entries):
    print(e)