Skip to content

Commit 1806a72

Browse files
committed
Add some one word fake sentences with missing lemmas for Stanza
1 parent 21e8d9e commit 1806a72

File tree

5 files changed

+220
-0
lines changed

5 files changed

+220
-0
lines changed

english-lemmas/check_lemmas.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Check how a list of word/lemma pairs shows up in CoNLL-U training data.

For each word in the --lemmas file, prints every training line that
contains the word as a whole token: first all VERB-tagged lines, then a
separator, then all non-VERB lines.  This makes missing or inconsistent
lemma annotations easy to eyeball.
"""

import argparse


def parse_args():
    """Build and parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--lemmas', default="en_lemmas.txt", type=str, help='Where to find the lemmas to look for')
    parser.add_argument('--conllu', default="/home/john/stanza/data/lemma/en_combined.train.in.conllu", type=str, help='Where to find the training data to check')
    return parser.parse_args()


def report_matches(lemmas, lines, want_verb):
    """Print each training line containing one of the listed words.

    lemmas: list of [word, lemma] pairs; only the word is searched for.
    lines: stripped CoNLL-U lines from the training file.
    want_verb: if True, print only lines containing "VERB";
      if False, print only lines that do not.
    """
    for lemma in lemmas:
        # Tabs on both sides so we only match a complete token column,
        # not a substring of a longer word.
        text = "\t%s\t" % lemma[0]
        print("Looking for %s" % text)
        for line in lines:
            if text in line and ("VERB" in line) == want_verb:
                print(line)


def main():
    args = parse_args()

    with open(args.lemmas) as fin:
        lemmas = [x.strip().split() for x in fin.readlines()]

    with open(args.conllu) as fin:
        lines = [x.strip() for x in fin.readlines()]

    report_matches(lemmas, lines, want_verb=True)
    print("===================")
    report_matches(lemmas, lines, want_verb=False)


if __name__ == '__main__':
    main()

english-lemmas/en_lemmas.conllu

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# sent_id = missing_lemma_0000
2+
# text = acquires
3+
1 acquires acquire VERB _ _ 0 root _ _
4+
5+
# sent_id = missing_lemma_0001
6+
# text = analyses
7+
1 analyses analyze VERB _ _ 0 root _ _
8+
9+
# sent_id = missing_lemma_0002
10+
# text = bolt
11+
1 bolt bolt VERB _ _ 0 root _ _
12+
13+
# sent_id = missing_lemma_0003
14+
# text = buys
15+
1 buys buy VERB _ _ 0 root _ _
16+
17+
# sent_id = missing_lemma_0004
18+
# text = diagnoses
19+
1 diagnoses diagnose VERB _ _ 0 root _ _
20+
21+
# sent_id = missing_lemma_0005
22+
# text = discloses
23+
1 discloses disclose VERB _ _ 0 root _ _
24+
25+
# sent_id = missing_lemma_0006
26+
# text = disguises
27+
1 disguises disguise VERB _ _ 0 root _ _
28+
29+
# sent_id = missing_lemma_0007
30+
# text = dose
31+
1 dose dose VERB _ _ 0 root _ _
32+
33+
# sent_id = missing_lemma_0008
34+
# text = downs
35+
1 downs down VERB _ _ 0 root _ _
36+
37+
# sent_id = missing_lemma_0009
38+
# text = echoes
39+
1 echoes echo VERB _ _ 0 root _ _
40+
41+
# sent_id = missing_lemma_0010
42+
# text = eschewing
43+
1 eschewing eschew VERB _ _ 0 root _ _
44+
45+
# sent_id = missing_lemma_0011
46+
# text = fade
47+
1 fade fade VERB _ _ 0 root _ _
48+
49+
# sent_id = missing_lemma_0012
50+
# text = frown
51+
1 frown frown VERB _ _ 0 root _ _
52+
53+
# sent_id = missing_lemma_0013
54+
# text = glimpses
55+
1 glimpses glimpse VERB _ _ 0 root _ _
56+
57+
# sent_id = missing_lemma_0014
58+
# text = gasses
59+
1 gasses gas VERB _ _ 0 root _ _
60+
61+
# sent_id = missing_lemma_0015
62+
# text = gowns
63+
1 gowns gown VERB _ _ 0 root _ _
64+
65+
# sent_id = missing_lemma_0016
66+
# text = hassled
67+
1 hassled hassle VERB _ _ 0 root _ _
68+
69+
# sent_id = missing_lemma_0017
70+
# text = loose
71+
1 loose loose VERB _ _ 0 root _ _
72+
73+
# sent_id = missing_lemma_0018
74+
# text = lounges
75+
1 lounges lounge VERB _ _ 0 root _ _
76+
77+
# sent_id = missing_lemma_0019
78+
# text = lower
79+
1 lower lower VERB _ _ 0 root _ _
80+
81+
# sent_id = missing_lemma_0020
82+
# text = mans
83+
1 mans man VERB _ _ 0 root _ _
84+
85+
# sent_id = missing_lemma_0021
86+
# text = nod
87+
1 nod nod VERB _ _ 0 root _ _
88+
89+
# sent_id = missing_lemma_0022
90+
# text = respects
91+
1 respects respect VERB _ _ 0 root _ _
92+
93+
# sent_id = missing_lemma_0023
94+
# text = revenges
95+
1 revenges revenge VERB _ _ 0 root _ _
96+
97+
# sent_id = missing_lemma_0024
98+
# text = rights
99+
1 rights right VERB _ _ 0 root _ _
100+
101+
# sent_id = missing_lemma_0025
102+
# text = singles
103+
1 singles single VERB _ _ 0 root _ _
104+
105+
# sent_id = missing_lemma_0026
106+
# text = skis
107+
1 skis ski VERB _ _ 0 root _ _
108+
109+
# sent_id = missing_lemma_0027
110+
# text = slot
111+
1 slot slot VERB _ _ 0 root _ _
112+
113+
# sent_id = missing_lemma_0028
114+
# text = teases
115+
1 teases tease VERB _ _ 0 root _ _
116+
117+
# sent_id = missing_lemma_0029
118+
# text = thanks
119+
1 thanks thank VERB _ _ 0 root _ _
120+
121+
# sent_id = missing_lemma_0030
122+
# text = traps
123+
1 traps trap VERB _ _ 0 root _ _
124+
125+
# sent_id = missing_lemma_0031
126+
# text = troops
127+
1 troops troop VERB _ _ 0 root _ _
128+
129+
# sent_id = missing_lemma_0032
130+
# text = whiles
131+
1 whiles while VERB _ _ 0 root _ _
132+
133+
# sent_id = missing_lemma_0033
134+
# text = widen
135+
1 widen widen VERB _ _ 0 root _ _
136+

english-lemmas/en_lemmas.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
acquires acquire
2+
analyses analyze
3+
bolt bolt
4+
buys buy
5+
diagnoses diagnose
6+
discloses disclose
7+
disguises disguise
8+
dose dose
9+
downs down
10+
echoes echo
11+
eschewing eschew
12+
fade fade
13+
frown frown
14+
glimpses glimpse
15+
gasses gas
16+
gowns gown
17+
hassled hassle
18+
loose loose
19+
lounges lounge
20+
lower lower
21+
mans man
22+
nod nod
23+
respects respect
24+
revenges revenge
25+
rights right
26+
singles single
27+
skis ski
28+
slot slot
29+
teases tease
30+
thanks thank
31+
traps trap
32+
troops troop
33+
whiles while
34+
widen widen

english-lemmas/fake_en_lemmas.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""Generate one-word fake CoNLL-U sentences from a word/lemma list.

Reads en_lemmas.txt (one "word lemma" pair per line; blank lines and
lines starting with '#' are skipped) and prints a single-token CoNLL-U
sentence for each pair, tagged VERB and attached to root.
"""


def print_sentences(pairs):
    """Print a one-token CoNLL-U sentence for each (word, lemma) pair."""
    for idx, (word, lemma) in enumerate(pairs):
        print("# sent_id = missing_lemma_%04d" % idx)
        print("# text = %s" % word)
        print("1\t%s\t%s\tVERB\t_\t_\t0\troot\t_\t_" % (word, lemma))
        print()


def main():
    with open("en_lemmas.txt") as fin:
        lines = fin.readlines()

    # Drop whitespace, blank lines, and comment lines before splitting
    # each remaining line into a (word, lemma) pair.
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x and not x.startswith("#")]

    print_sentences([x.split() for x in lines])


if __name__ == '__main__':
    main()

english-lemmas/readme.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
A few lemmas for verbs missing from the English datasets, which the
2+
Stanza lemmatizer was getting wrong, provided by Prof. Guy Lapalme
3+
from Montreal.

0 commit comments

Comments
 (0)