Commit 3ca43e2

ekaf and tomaarsen authored
Fix wordnet's all_synsets() function (nltk#3078)
* Fix all_synsets() function
* Add simple regression tests for nltk#3077
* Add suggestions by @tomaarsen

Co-authored-by: Tom Aarsen <[email protected]>
1 parent 754c10c commit 3ca43e2
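
Background on the bug: in the old all_eng_synsets(), a requested pos_tag of ADJ_SAT was overwritten with ADJ before the data file was opened, and both branches of the satellite check then yielded the synset, so part-of-speech filtering for adjectives was effectively a no-op. A minimal sketch of the behavior this commit restores (outputs taken from the regression doctests added below; they assume the standard WordNet 3.0 data is installed):

    >>> from nltk.corpus import wordnet as wn
    >>> next(wn.all_synsets(pos="a"))  # plain adjectives only
    Synset('able.a.01')
    >>> next(wn.all_synsets(pos="s"))  # adjective satellites only
    Synset('emergent.s.02')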

2 files changed: +117 additions, -68 deletions


nltk/corpus/reader/wordnet.py

Lines changed: 99 additions & 68 deletions
@@ -108,6 +108,11 @@
     "Somebody %s VERB-ing",
     "It %s that CLAUSE",
     "Something %s INFINITIVE",
+    # OEWN additions:
+    "Somebody %s at something",
+    "Somebody %s for something",
+    "Somebody %s on somebody",
+    "Somebody %s out of somebody",
 )

 SENSENUM_RE = re.compile(r"\.[\d]+\.")

@@ -454,7 +459,7 @@ def _doc(self, doc_type, default, lang="eng"):
             corpus._load_lang_data(lang)
             of = corpus.ss2of(self)
             i = corpus.lg_attrs.index(doc_type)
-            if of in corpus._lang_data[lang][i].keys():
+            if of in corpus._lang_data[lang][i]:
                 return corpus._lang_data[lang][i][of]
             else:
                 return None

@@ -481,11 +486,11 @@ def lemma_names(self, lang="eng"):
         if lang == "eng":
             return self._lemma_names
         else:
-            self._wordnet_corpus_reader._load_lang_data(lang)
-
-            i = self._wordnet_corpus_reader.ss2of(self, lang)
-            if i in self._wordnet_corpus_reader._lang_data[lang][0]:
-                return self._wordnet_corpus_reader._lang_data[lang][0][i]
+            reader = self._wordnet_corpus_reader
+            reader._load_lang_data(lang)
+            i = reader.ss2of(self)
+            if i in reader._lang_data[lang][0]:
+                return reader._lang_data[lang][0][i]
             else:
                 return []

@@ -1203,59 +1208,77 @@ def __init__(self, root, omw_reader):
         # load the exception file data into memory
         self._load_exception_map()

+        self.nomap = []
+        self.splits = {}
+
         # map from WordNet 3.0 for OMW data
         self.map30 = self.map_wn30()

         # Language data attributes
         self.lg_attrs = ["lemma", "none", "def", "exe"]

-    def corpus2sk(self, corpus=None):
+    def index_sense(self, version=None):
         """Read sense key to synset id mapping from index.sense file in corpus directory"""
         fn = "index.sense"
-        if corpus:
+        if version:
             from nltk.corpus import CorpusReader, LazyCorpusLoader

-            ixreader = LazyCorpusLoader(corpus, CorpusReader, r".*/" + fn)
+            ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn)
         else:
             ixreader = self
         with ixreader.open(fn) as fp:
-            sk_map = {}
+            sensekey_map = {}
             for line in fp:
-                items = line.strip().split(" ")
-                sk = items[0]
-                pos = self._pos_names[int(sk.split("%")[1].split(":")[0])]
-                sk_map[sk] = f"{items[1]}-{pos}"
-        return sk_map
+                fields = line.strip().split()
+                sensekey = fields[0]
+                pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])]
+                sensekey_map[sensekey] = f"{fields[1]}-{pos}"
+        return sensekey_map
+
+    def map_to_many(self):
+        sensekey_map1 = self.index_sense("wordnet")
+        sensekey_map2 = self.index_sense()
+        synset_to_many = {}
+        for synsetid in set(sensekey_map1.values()):
+            synset_to_many[synsetid] = []
+        for sensekey in set(sensekey_map1.keys()).intersection(
+            set(sensekey_map2.keys())
+        ):
+            source = sensekey_map1[sensekey]
+            target = sensekey_map2[sensekey]
+            synset_to_many[source].append(target)
+        return synset_to_many
+
+    def map_to_one(self):
+        synset_to_many = self.map_to_many()
+        synset_to_one = {}
+        for source in synset_to_many:
+            candidates_bag = synset_to_many[source]
+            if candidates_bag:
+                candidates_set = set(candidates_bag)
+                if len(candidates_set) == 1:
+                    target = candidates_bag[0]
+                else:
+                    counts = []
+                    for candidate in candidates_set:
+                        counts.append((candidates_bag.count(candidate), candidate))
+                    self.splits[source] = counts
+                    target = max(counts)[1]
+                synset_to_one[source] = target
+                if source[-1] == "s":
+                    # Add a mapping from "a" to target for applications like omw,
+                    # where only Lithuanian and Slovak use the "s" ss_type.
+                    synset_to_one[f"{source[:-1]}a"] = target
+            else:
+                self.nomap.append(source)
+        return synset_to_one

     def map_wn30(self):
         """Mapping from Wordnet 3.0 to currently loaded Wordnet version"""
         if self.get_version() == "3.0":
             return None
-        # warnings.warn(f"Mapping WN v. 3.0 to Wordnet v. {self.version}")
-        sk1 = self.corpus2sk("wordnet")
-        sk2 = self.corpus2sk()
-
-        skmap = {}
-        for sk in set(sk1.keys()).intersection(set(sk2.keys())):
-            of1 = sk1[sk]
-            of2 = sk2[sk]
-            if of1 not in skmap.keys():
-                skmap[of1] = [of2]
-            else:
-                skmap[of1].append(of2)
-
-        map30 = {}
-        for of in skmap.keys():
-            candidates = skmap[of]
-            # map to candidate that covers most lemmas:
-            of2 = max((candidates.count(x), x) for x in set(candidates))[1]
-            # warnings.warn(f"Map {of} {of2}")
-            map30[of] = of2
-            if of[-1] == "s":
-                # Add a mapping from "a" to "a" for applications like omw,
-                # which don't use the "s" ss_type:
-                map30[f"{of[:-1]}a"] = f"{of2[:-1]}a"
-        return map30
+        else:
+            return self.map_to_one()

     # Open Multilingual WordNet functions, contributed by
     # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn

@@ -1264,19 +1287,16 @@ def of2ss(self, of):
         """take an id and return the synsets"""
         return self.synset_from_pos_and_offset(of[-1], int(of[:8]))

-    def ss2of(self, ss, lang=None):
+    def ss2of(self, ss):
         """return the ID of the synset"""
-        pos = ss.pos()
-        # Only these 3 WordNets retain the satellite pos tag
-        if lang not in ["nld", "lit", "slk"] and pos == "s":
-            pos = "a"
-        return f"{ss.offset():08d}-{pos}"
+        if ss:
+            return f"{ss.offset():08d}-{ss.pos()}"

     def _load_lang_data(self, lang):
         """load the wordnet data of the requested language from the file to
         the cache, _lang_data"""

-        if lang in self._lang_data.keys():
+        if lang in self._lang_data:
             return

         if self._omw_reader and not self.omw_langs:

@@ -1308,7 +1328,7 @@ def add_provs(self, reader):
             file_name, file_extension = os.path.splitext(langfile)
             if file_extension == ".tab":
                 lang = file_name.split("-")[-1]
-                if lang in self.provenances.keys() or prov in ["cldr", "wikt"]:
+                if lang in self.provenances or prov in ["cldr", "wikt"]:
                     # We already have another resource for this lang,
                     # so we need to further specify the lang id:
                     lang = f"{lang}_{prov}"

@@ -1540,7 +1560,7 @@ def synset_from_pos_and_offset(self, pos, offset):
             assert synset._offset == offset
             self._synset_offset_cache[pos][offset] = synset
         else:
-            synset = Synset(self)
+            synset = None
             warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.")
         data_file.seek(0)
         return synset

@@ -1807,16 +1827,15 @@ def all_omw_synsets(self, pos=None, lang=None):
         if lang not in self.langs():
             return None
         self._load_lang_data(lang)
-        for of in self._lang_data[lang][0].keys():
-            try:
+        for of in self._lang_data[lang][0]:
+            if not pos or of[-1] == pos:
                 ss = self.of2ss(of)
-                yield ss
-            except:
-                # A few OMW offsets don't exist in Wordnet 3.0.
-                # Additionally, when mapped to later Wordnets,
-                # increasing numbers of synsets are lost in the mapping.
-                # warnings.warn(f"Language {lang}: no synset found for {of}")
-                pass
+                if ss:
+                    yield ss
+
+                # else:
+                # A few OMW offsets don't exist in Wordnet 3.0.
+                # warnings.warn(f"Language {lang}: no synset found for {of}")

     def all_synsets(self, pos=None, lang="eng"):
         """Iterate over all synsets with a given part of speech tag.

@@ -1840,12 +1859,14 @@ def all_eng_synsets(self, pos=None):
         # generate all synsets for each part of speech
         for pos_tag in pos_tags:
             # Open the file for reading. Note that we can not re-use
-            # the file poitners from self._data_file_map here, because
+            # the file pointers from self._data_file_map here, because
             # we're defining an iterator, and those file pointers might
             # be moved while we're not looking.
             if pos_tag == ADJ_SAT:
-                pos_tag = ADJ
-            fileid = "data.%s" % self._FILEMAP[pos_tag]
+                pos_file = ADJ
+            else:
+                pos_file = pos_tag
+            fileid = "data.%s" % self._FILEMAP[pos_file]
             data_file = self.open(fileid)

             try:

@@ -1865,12 +1886,11 @@ def all_eng_synsets(self, pos=None):
                         # adjective satellites are in the same file as
                         # adjectives so only yield the synset if it's actually
                         # a satellite
-                        if synset._pos == ADJ_SAT:
+                        if pos_tag == ADJ_SAT and synset._pos == ADJ_SAT:
                             yield synset
-
                         # for all other POS tags, yield all synsets (this means
                         # that adjectives also include adjective satellites)
-                        else:
+                        elif pos_tag != ADJ_SAT:
                             yield synset
                     offset = data_file.tell()
                     line = data_file.readline()

@@ -2187,13 +2207,24 @@ def custom_lemmas(self, tab_file, lang):
                 offset_pos, label = triple[:2]
                 val = triple[-1]
                 if self.map30:
-                    if offset_pos in self.map30.keys():
+                    if offset_pos in self.map30:
                         # Map offset_pos to current Wordnet version:
                         offset_pos = self.map30[offset_pos]
                     else:
-                        # Synsets with no mapping keep their Wordnet 3.0 offset
-                        # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
-                        pass
+                        # Some OMW offsets were never in Wordnet:
+                        if (
+                            offset_pos not in self.nomap
+                            and offset_pos.replace("a", "s") not in self.nomap
+                        ):
+                            warnings.warn(
+                                f"{lang}: invalid offset {offset_pos} in '{line}'"
+                            )
+                        continue
+                elif offset_pos[-1] == "a":
+                    wnss = self.of2ss(offset_pos)
+                    if wnss and wnss.pos() == "s":  # Wordnet pos is "s"
+                        # Label OMW adjective satellites back to their Wordnet pos ("s")
+                        offset_pos = self.ss2of(wnss)
                 pair = label.split(":")
                 attr = pair[-1]
                 if len(pair) == 1 or pair[0] == lg:
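
A note on the new mapping helpers above: map_to_many() can associate one WordNet 3.0 synset id with several candidate ids in the loaded WordNet, and map_to_one() resolves the ambiguity by picking the candidate that covers the most sense keys, taking max() over (count, candidate) pairs. A self-contained sketch of that selection step, using made-up offsets rather than real WordNet ids:

    >>> candidates_bag = ["00001740-a", "00001740-a", "00002098-a"]
    >>> counts = [(candidates_bag.count(c), c) for c in set(candidates_bag)]
    >>> max(counts)[1]
    '00001740-a'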

nltk/test/wordnet.doctest

Lines changed: 18 additions & 0 deletions
@@ -638,6 +638,24 @@ Issue 2721: WordNetCorpusReader.ic() does not add smoothing to N
     >>> information_content(word, fake_ic) > 0
     True

+Issue 3077: Incorrect part-of-speech filtering in all_synsets
+
+    >>> next(wn.all_synsets(pos="a"))
+    Synset('able.a.01')
+    >>> next(wn.all_synsets(pos="s"))
+    Synset('emergent.s.02')
+    >>> wn.add_omw()
+    >>> next(wn.all_synsets(lang="hrv"))
+    Synset('able.a.01')
+    >>> next(wn.all_synsets(lang="hrv", pos="n"))
+    Synset('entity.n.01')
+    >>> next(wn.all_synsets(lang="hrv", pos="v"))
+    Synset('breathe.v.01')
+    >>> next(wn.all_synsets(lang="hrv", pos="s"))
+    Synset('ideological.s.02')
+    >>> next(wn.all_synsets(lang="hrv", pos="a"))
+    Synset('able.a.01')
+

 ------------------------------------------------
 Endlessness vs. intractability in relation trees
