Skip to content

Commit b718276

Browse files
authored
Merge pull request nltk#3126 from ekaf/hotfix-3125
Avoid duplicate merged OMW synsets and lemmas
2 parents (796b03b + a3a7e53), commit b718276

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

nltk/corpus/reader/wordnet.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1216,7 +1216,7 @@ def __init__(self, root, omw_reader):
12161216
self.map30 = self.map_wn()
12171217

12181218
# Language data attributes
1219-
self.lg_attrs = ["lemma", "none", "def", "exe"]
1219+
self.lg_attrs = ["lemma", "of", "def", "exe"]
12201220

12211221
def index_sense(self, version=None):
12221222
"""Read sense key to synset id mapping from index.sense file in corpus directory"""
@@ -1251,7 +1251,7 @@ def map_to_many(self, version="wordnet"):
12511251
return synset_to_many
12521252

12531253
def map_to_one(self, version="wordnet"):
1254-
self.nomap[version] = []
1254+
self.nomap[version] = set()
12551255
self.splits[version] = {}
12561256
synset_to_many = self.map_to_many(version)
12571257
synset_to_one = {}
@@ -1273,7 +1273,7 @@ def map_to_one(self, version="wordnet"):
12731273
# where only Lithuanian and Slovak use the "s" ss_type.
12741274
synset_to_one[f"{source[:-1]}a"] = target
12751275
else:
1276-
self.nomap[version].append(source)
1276+
self.nomap[version].add(source)
12771277
return synset_to_one
12781278

12791279
def map_wn(self, version="wordnet"):
@@ -1294,7 +1294,9 @@ def merged_synsets(self, version="wordnet"):
12941294
for source, targets in self.map_to_many(version).items():
12951295
for target in targets:
12961296
merge[target].add(source)
1297-
self.merges[version] = {s: t for s, t in merge.items() if len(t) > 1}
1297+
self.merges[version] = {
1298+
trg: src for trg, src in merge.items() if len(src) > 1
1299+
}
12981300
return self.merges[version]
12991301

13001302
# Open Multilingual WordNet functions, contributed by
@@ -2230,8 +2232,9 @@ def custom_lemmas(self, tab_file, lang):
22302232
else:
22312233
# Some OMW offsets were never in Wordnet:
22322234
if (
2233-
offset_pos not in self.nomap
2234-
and offset_pos.replace("a", "s") not in self.nomap
2235+
offset_pos not in self.nomap["wordnet"]
2236+
and offset_pos.replace("a", "s")
2237+
not in self.nomap["wordnet"]
22352238
):
22362239
warnings.warn(
22372240
f"{lang}: invalid offset {offset_pos} in '{line}'"
@@ -2247,11 +2250,15 @@ def custom_lemmas(self, tab_file, lang):
22472250
if len(pair) == 1 or pair[0] == lg:
22482251
if attr == "lemma":
22492252
val = val.strip().replace(" ", "_")
2250-
self._lang_data[lang][1][val.lower()].append(offset_pos)
2253+
lang_offsets = self._lang_data[lang][1][val.lower()]
2254+
if offset_pos not in lang_offsets:
2255+
lang_offsets.append(offset_pos)
22512256
if attr in self.lg_attrs:
2252-
self._lang_data[lang][self.lg_attrs.index(attr)][
2257+
lang_lemmas = self._lang_data[lang][self.lg_attrs.index(attr)][
22532258
offset_pos
2254-
].append(val)
2259+
]
2260+
if val not in lang_lemmas:
2261+
lang_lemmas.append(val)
22552262

22562263
def disable_custom_lemmas(self, lang):
22572264
"""prevent synsets from being mistakenly added"""

0 commit comments

Comments
 (0)