Skip to content

Commit f676fbb

Browse files
committed
Add FR lemmas missing from the training data, also provided by Prof. Guy Lapalme
1 parent 1806a72 commit f676fbb

File tree

3 files changed

+9076
-0
lines changed

3 files changed

+9076
-0
lines changed

french-lemmas/check_lemmas.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
# Reads in the training document for the Stanza French lemmatizer and
# looks for verbs which have expected lemmas in a list provided to us
# by Prof. Guy Lapalme, but were not in the training data.
#
# If this script is rerun after Stanza 1.10, be sure not to clobber
# the output file but rather append newly found missing lemmas

import stanza

from stanza.utils.conll import CoNLL

# TODO: could make these arguments to an argparse
fr_words_file = "output_fr.txt"
conllu_file = "/home/john/stanza/data/lemma/fr_combined.train.in.conllu"
output_file = "fr_lemmas.conllu"

# Map surface form -> lemma for every token tagged VERB in the existing
# training data, so we can tell "already known" verbs from missing ones.
doc = CoNLL.conll2doc(conllu_file)
known_verbs = {word.text: word.lemma
               for sentence in doc.sentences
               for word in sentence.words
               if word.upos == 'VERB'}
print(len(known_verbs))

with open(fr_words_file, encoding="utf-8") as fin:
    lines = fin.readlines()

# The lemma entries start at the "Règles et lexique" header; anything
# before it is preamble.  Fail loudly if the header is missing, since
# that means the input file format has changed.
for idx, line in enumerate(lines):
    if line.startswith("Règles et lexique"):
        break
else:
    raise ValueError("Unexpected file format")
lines = lines[idx:]

# Each usable entry is colon-separated into exactly 4 fields:
# field 0 is the conjugated phrase, field 2 is the expected lemma.
# Lines that do not split into 4 fields (including the header itself)
# are discarded.
lines = [line.split(":", maxsplit=3) for line in lines]
lines = [line for line in lines if len(line) == 4]

new_training_lines = []
pipe = stanza.Pipeline("fr", processors="tokenize,pos,lemma")
for line in lines:
    input_text = line[0].strip()
    expected_lemma = line[2].strip()
    doc = pipe(input_text)
    if len(doc.sentences) > 1:
        raise ValueError("Error in number of sentences! |%s|" % input_text)
    # Each entry is expected to tokenize to exactly two words
    # (pronoun + verb).  The original check only rejected >2 words;
    # a 1-word parse would have crashed with IndexError below, so we
    # require exactly 2.
    if len(doc.sentences[0].words) != 2:
        raise ValueError("Error in number of words! |%s|" % input_text)

    # The verb is the second word of the "pronoun verb" phrase.
    verb = doc.sentences[0].words[1].text
    output_lemma = doc.sentences[0].words[1].lemma
    if output_lemma != expected_lemma:
        if verb in known_verbs:
            # The verb is already in the training data, yet the model
            # still lemmatizes it differently -- flag it for manual
            # inspection instead of adding a conflicting entry.
            print("Unexpected error for %s" % verb)
        else:
            new_training_lines.append((verb, expected_lemma))

# Write one single-word CoNLL-U "sentence" per missing verb so the
# pairs can be appended to the lemmatizer training data.
with open(output_file, "w", encoding="utf-8") as fout:
    for sent_id, (verb, expected_lemma) in enumerate(new_training_lines):
        print(verb, expected_lemma)
        # %04d zero-pads: the original %4d space-padded, which would put
        # spaces inside the sent_id value.
        fout.write("# sent_id = missing_lemma_%04d\n" % sent_id)
        fout.write("# text = %s\n" % verb)
        fout.write("1\t%s\t%s\tVERB\t_\t_\t0\troot\t_\t_\n\n" % (verb, expected_lemma))

0 commit comments

Comments
 (0)