|
| 1 | +# Reads in the training document for the Stanza French lemmatizer and |
| 2 | +# looks for verbs which have expected lemmas in a list provided to us |
| 3 | +# by Prof. Guy Lapalme, but were not in the training data. |
| 4 | +# |
| 5 | +# If this script is rerun after Stanza 1.10, be sure not to clobber |
| 6 | +# the output file but rather append newly found missing lemmas |
| 7 | + |
| 8 | +import stanza |
| 9 | + |
from stanza.utils.conll import CoNLL

# Input and output locations.
# TODO: could make these arguments to an argparse
fr_words_file = "output_fr.txt"
conllu_file = "/home/john/stanza/data/lemma/fr_combined.train.in.conllu"
output_file = "fr_lemmas.conllu"

# Read the lemmatizer training data from its CoNLL-U file.
doc = CoNLL.conll2doc(conllu_file)

# Surface form -> lemma for every word tagged VERB in the training data.
# If the same form occurs more than once, the last occurrence wins.
known_verbs = dict(
    (word.text, word.lemma)
    for sentence in doc.sentences
    for word in sentence.words
    if word.upos == 'VERB'
)
# Report how many distinct verb forms the training data already covers.
print(len(known_verbs))
| 20 | + |
# Load the word list and keep only the part from the "Règles et lexique"
# header onward; anything before that header is ignored.
with open(fr_words_file, encoding="utf-8") as fin:
    lines = list(fin)

idx = next(
    (pos for pos, text in enumerate(lines) if text.startswith("Règles et lexique")),
    None,
)
if idx is None:
    raise ValueError("Unexpected file format")

# Split each remaining line on ":" into at most four fields and keep only
# the lines that produce exactly four.  The header line itself is part of
# the slice and survives only if it also splits into four fields.
# The code further down reads field 0 as the input text and field 2 as
# the expected lemma.
lines = [
    fields
    for fields in (text.split(":", maxsplit=3) for text in lines[idx:])
    if len(fields) == 4
]
| 31 | + |
# (verb form, expected lemma) pairs which the current lemmatizer gets wrong
# and whose verb form does not occur as a VERB in the training data.
new_training_lines = []
pipe = stanza.Pipeline("fr", processors="tokenize,pos,lemma")
for line in lines:
    input_text = line[0].strip()
    expected_lemma = line[2].strip()
    doc = pipe(input_text)

    # The code below reads the second word of the first sentence as the verb,
    # so each entry must analyze as exactly one sentence of exactly two words.
    # NOTE(review): presumably each entry is "<pronoun> <verb>" - confirm
    # against the word list.
    # Checking for exact counts (rather than only "too many") turns an empty
    # or one-word analysis into the same descriptive ValueError instead of a
    # bare IndexError.
    if len(doc.sentences) != 1:
        raise ValueError("Error in number of sentences! |%s|" % input_text)
    words = doc.sentences[0].words
    if len(words) != 2:
        raise ValueError("Error in number of words! |%s|" % input_text)

    verb = words[1].text
    output_lemma = words[1].lemma
    # A possible extra diagnostic (currently disabled): report verbs whose
    # lemma in the training data disagrees with the expected lemma, i.e.
    # verb in known_verbs and expected_lemma != known_verbs[verb].
    if output_lemma == expected_lemma:
        continue

    if verb in known_verbs:
        # The form is already in the training data, so adding it again would
        # not explain the wrong lemma; just report it.
        print("Unexpected error for %s" % verb)
    else:
        new_training_lines.append((verb, expected_lemma))
| 53 | + |
# Write each missing (verb, lemma) pair as a one-word CoNLL-U sentence.
# NOTE: mode "w" truncates any existing output file.
with open(output_file, "w", encoding="utf-8") as fout:
    for sent_id, (verb, expected_lemma) in enumerate(new_training_lines):
        # Echo the pair to the console as it is written.
        print(verb, expected_lemma)
        # Zero-pad the counter: "%4d" would pad with spaces and put
        # whitespace inside the sent_id (e.g. "missing_lemma_   0").
        fout.write("# sent_id = missing_lemma_%04d\n" % sent_id)
        fout.write("# text = %s\n" % verb)
        # Ten tab-separated columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS,
        # HEAD, DEPREL, DEPS, MISC.  The blank line ends the sentence.
        fout.write("1\t%s\t%s\tVERB\t_\t_\t0\troot\t_\t_\n\n" % (verb, expected_lemma))
| 60 | + |
0 commit comments