Skip to content

Commit 1806a72

Browse files
committed
Add some one word fake sentences with missing lemmas for Stanza
1 parent 21e8d9e commit 1806a72

File tree

5 files changed

+220
-0
lines changed

5 files changed

+220
-0
lines changed

english-lemmas/check_lemmas.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Check how a list of word/lemma pairs shows up in CoNLL-U training data.

For each word in the --lemmas file, prints every training line that
contains the word as a whole token: first all VERB-tagged lines, then a
separator, then all non-VERB lines.  This makes missing or inconsistent
lemma annotations easy to eyeball.
"""

import argparse


def parse_args():
    """Build and parse the command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--lemmas', default="en_lemmas.txt", type=str, help='Where to find the lemmas to look for')
    parser.add_argument('--conllu', default="/home/john/stanza/data/lemma/en_combined.train.in.conllu", type=str, help='Where to find the training data to check')
    return parser.parse_args()


def report_matches(lemmas, lines, want_verb):
    """Print each training line containing one of the listed words.

    lemmas: list of [word, lemma] pairs; only the word is searched for.
    lines: stripped CoNLL-U lines from the training file.
    want_verb: if True, print only lines containing "VERB";
      if False, print only lines that do not.
    """
    for lemma in lemmas:
        # Tabs on both sides so we only match a complete token column,
        # not a substring of a longer word.
        text = "\t%s\t" % lemma[0]
        print("Looking for %s" % text)
        for line in lines:
            if text in line and ("VERB" in line) == want_verb:
                print(line)


def main():
    args = parse_args()

    with open(args.lemmas) as fin:
        lemmas = [x.strip().split() for x in fin.readlines()]

    with open(args.conllu) as fin:
        lines = [x.strip() for x in fin.readlines()]

    report_matches(lemmas, lines, want_verb=True)
    print("===================")
    report_matches(lemmas, lines, want_verb=False)


if __name__ == '__main__':
    main()

english-lemmas/en_lemmas.conllu

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# sent_id = missing_lemma_0000
2+
# text = acquires
3+
1 acquires acquire VERB _ _ 0 root _ _
4+
5+
# sent_id = missing_lemma_0001
6+
# text = analyses
7+
1 analyses analyze VERB _ _ 0 root _ _
8+
9+
# sent_id = missing_lemma_0002
10+
# text = bolt
11+
1 bolt bolt VERB _ _ 0 root _ _
12+
13+
# sent_id = missing_lemma_0003
14+
# text = buys
15+
1 buys buy VERB _ _ 0 root _ _
16+
17+
# sent_id = missing_lemma_0004
18+
# text = diagnoses
19+
1 diagnoses diagnose VERB _ _ 0 root _ _
20+
21+
# sent_id = missing_lemma_0005
22+
# text = discloses
23+
1 discloses disclose VERB _ _ 0 root _ _
24+
25+
# sent_id = missing_lemma_0006
26+
# text = disguises
27+
1 disguises disguise VERB _ _ 0 root _ _
28+
29+
# sent_id = missing_lemma_0007
30+
# text = dose
31+
1 dose dose VERB _ _ 0 root _ _
32+
33+
# sent_id = missing_lemma_0008
34+
# text = downs
35+
1 downs down VERB _ _ 0 root _ _
36+
37+
# sent_id = missing_lemma_0009
38+
# text = echoes
39+
1 echoes echo VERB _ _ 0 root _ _
40+
41+
# sent_id = missing_lemma_0010
42+
# text = eschewing
43+
1 eschewing eschew VERB _ _ 0 root _ _
44+
45+
# sent_id = missing_lemma_0011
46+
# text = fade
47+
1 fade fade VERB _ _ 0 root _ _
48+
49+
# sent_id = missing_lemma_0012
50+
# text = frown
51+
1 frown frown VERB _ _ 0 root _ _
52+
53+
# sent_id = missing_lemma_0013
54+
# text = glimpses
55+
1 glimpses glimpse VERB _ _ 0 root _ _
56+
57+
# sent_id = missing_lemma_0014
58+
# text = gasses
59+
1 gasses gas VERB _ _ 0 root _ _
60+
61+
# sent_id = missing_lemma_0015
62+
# text = gowns
63+
1 gowns gown VERB _ _ 0 root _ _
64+
65+
# sent_id = missing_lemma_0016
66+
# text = hassled
67+
1 hassled hassle VERB _ _ 0 root _ _
68+
69+
# sent_id = missing_lemma_0017
70+
# text = loose
71+
1 loose loose VERB _ _ 0 root _ _
72+
73+
# sent_id = missing_lemma_0018
74+
# text = lounges
75+
1 lounges lounge VERB _ _ 0 root _ _
76+
77+
# sent_id = missing_lemma_0019
78+
# text = lower
79+
1 lower lower VERB _ _ 0 root _ _
80+
81+
# sent_id = missing_lemma_0020
82+
# text = mans
83+
1 mans man VERB _ _ 0 root _ _
84+
85+
# sent_id = missing_lemma_0021
86+
# text = nod
87+
1 nod nod VERB _ _ 0 root _ _
88+
89+
# sent_id = missing_lemma_0022
90+
# text = respects
91+
1 respects respect VERB _ _ 0 root _ _
92+
93+
# sent_id = missing_lemma_0023
94+
# text = revenges
95+
1 revenges revenge VERB _ _ 0 root _ _
96+
97+
# sent_id = missing_lemma_0024
98+
# text = rights
99+
1 rights right VERB _ _ 0 root _ _
100+
101+
# sent_id = missing_lemma_0025
102+
# text = singles
103+
1 singles single VERB _ _ 0 root _ _
104+
105+
# sent_id = missing_lemma_0026
106+
# text = skis
107+
1 skis ski VERB _ _ 0 root _ _
108+
109+
# sent_id = missing_lemma_0027
110+
# text = slot
111+
1 slot slot VERB _ _ 0 root _ _
112+
113+
# sent_id = missing_lemma_0028
114+
# text = teases
115+
1 teases tease VERB _ _ 0 root _ _
116+
117+
# sent_id = missing_lemma_0029
118+
# text = thanks
119+
1 thanks thank VERB _ _ 0 root _ _
120+
121+
# sent_id = missing_lemma_0030
122+
# text = traps
123+
1 traps trap VERB _ _ 0 root _ _
124+
125+
# sent_id = missing_lemma_0031
126+
# text = troops
127+
1 troops troop VERB _ _ 0 root _ _
128+
129+
# sent_id = missing_lemma_0032
130+
# text = whiles
131+
1 whiles while VERB _ _ 0 root _ _
132+
133+
# sent_id = missing_lemma_0033
134+
# text = widen
135+
1 widen widen VERB _ _ 0 root _ _
136+

english-lemmas/en_lemmas.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
acquires acquire
2+
analyses analyze
3+
bolt bolt
4+
buys buy
5+
diagnoses diagnose
6+
discloses disclose
7+
disguises disguise
8+
dose dose
9+
downs down
10+
echoes echo
11+
eschewing eschew
12+
fade fade
13+
frown frown
14+
glimpses glimpse
15+
gasses gas
16+
gowns gown
17+
hassled hassle
18+
loose loose
19+
lounges lounge
20+
lower lower
21+
mans man
22+
nod nod
23+
respects respect
24+
revenges revenge
25+
rights right
26+
singles single
27+
skis ski
28+
slot slot
29+
teases tease
30+
thanks thank
31+
traps trap
32+
troops troop
33+
whiles while
34+
widen widen

english-lemmas/fake_en_lemmas.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""Generate one-word fake CoNLL-U sentences from a word/lemma list.

Reads en_lemmas.txt (one "word lemma" pair per line; blank lines and
lines starting with '#' are skipped) and prints a single-token CoNLL-U
sentence for each pair, tagged VERB and attached to root.
"""


def print_sentences(pairs):
    """Print a one-token CoNLL-U sentence for each (word, lemma) pair."""
    for idx, (word, lemma) in enumerate(pairs):
        print("# sent_id = missing_lemma_%04d" % idx)
        print("# text = %s" % word)
        print("1\t%s\t%s\tVERB\t_\t_\t0\troot\t_\t_" % (word, lemma))
        print()


def main():
    with open("en_lemmas.txt") as fin:
        lines = fin.readlines()

    # Drop whitespace, blank lines, and comment lines before splitting
    # each remaining line into a (word, lemma) pair.
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x and not x.startswith("#")]

    print_sentences([x.split() for x in lines])


if __name__ == '__main__':
    main()

english-lemmas/readme.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
A few lemmas for verbs missing from the English datasets, which the
2+
Stanza lemmatizer was getting wrong, provided by Prof. Guy Lapalme
3+
from Montreal.

0 commit comments

Comments
 (0)