Skip to content

Commit d30a954

Browse files
authored
Merge pull request nltk#3400 from ekaf/hotfix-3399
Duplicate only real adjective satellites
2 parents 75a63e5 + 3e91f48 commit d30a954

File tree

3 files changed

+63
-6
lines changed

3 files changed

+63
-6
lines changed

nltk/corpus/reader/sentiwordnet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
2424
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
2525
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
26-
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
26+
SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
2727
SentiSynset('behind.r.03')]
2828
>>> happy = swn.senti_synsets('happy', 'a')
2929
>>> happy0 = list(happy)[0]

nltk/corpus/reader/wordnet.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,9 @@ def __init__(self, root, omw_reader):
12041204
assert int(index) == i
12051205
self._lexnames.append(lexname)
12061206

1207+
# Build a set of adjective satellite offsets
1208+
self._scan_satellites()
1209+
12071210
# Load the indices for lemmas and synset offsets
12081211
self._load_lemma_pos_offset_map()
12091212

@@ -1324,7 +1327,7 @@ def _load_lang_data(self, lang):
13241327
self.add_omw()
13251328

13261329
if lang not in self.langs():
1327-
raise WordNetError("Language is not supported.")
1330+
raise WordNetError(f"Language {lang} is not supported.")
13281331

13291332
if self._exomw_reader and lang not in self.omw_langs:
13301333
reader = self._exomw_reader
@@ -1379,6 +1382,30 @@ def langs(self):
13791382
"""return a list of languages supported by Multilingual Wordnet"""
13801383
return list(self.provenances.keys())
13811384

1385+
def _scan_satellites(self):
    """
    Collect the offsets of all adjective satellite synsets.

    Reads the adjective data file line by line. Blank lines, lines that
    begin with a space, and lines with fewer than three whitespace-separated
    fields are ignored. For every remaining line whose third field is 's'
    (adjective satellite), the first field is converted to an int and
    recorded. The resulting set is stored in self.satellite_offsets;
    nothing is returned.
    """
    stream = self._data_file(ADJ)
    stream.seek(0)
    found = set()
    for raw in stream:
        # Tokenise once; an empty token list means the line was blank
        parts = raw.split()
        # Three fields are required before the synset type can be read
        if not parts or raw.startswith(" ") or len(parts) < 3:
            continue
        if parts[2] == "s":
            found.add(int(parts[0]))
    # Rewind again in case the file is read elsewhere
    stream.seek(0)
    self.satellite_offsets = found
1408+
13821409
def _load_lemma_pos_offset_map(self):
13831410
for suffix in self._FILEMAP.values():
13841411
# parse each line of the file (ignoring comment lines)
@@ -1425,8 +1452,15 @@ def _next_token():
14251452
# map lemmas and parts of speech to synsets
14261453
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
14271454
if pos == ADJ:
1428-
# Duplicate all adjectives indiscriminately?:
1429-
self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
1455+
# index.adj uses only the ADJ pos, so identify ADJ_SAT using satellites set
1456+
satellite_offsets = [
1457+
# Keep the ordering from index.adj
1458+
offset
1459+
for offset in synset_offsets
1460+
if offset in self.satellite_offsets
1461+
]
1462+
# Duplicate only a (possibly empty) list of real satellites
1463+
self._lemma_pos_offset_map[lemma][ADJ_SAT] = satellite_offsets
14301464

14311465
def _load_exception_map(self):
14321466
# load the exception file data into memory

nltk/test/wordnet.doctest

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ Lemmas
156156
>>> wn.lemma_from_key(eat.key()).synset()
157157
Synset('feed.v.06')
158158
>>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00')
159-
Lemma('backward.s.03.feebleminded')
159+
Lemma('backward.s.01.feebleminded')
160160
>>> for lemma in wn.synset('eat.v.03').lemmas():
161161
... print(lemma, lemma.count())
162162
...
@@ -397,6 +397,29 @@ Walk through the noun synsets looking at their hypernyms:
397397
Synset('object.n.01') [Synset('physical_entity.n.01')]
398398

399399

400+
Issue 3399: When specifying pos="a", both head adjectives and adjective satellites are returned.
401+
402+
>>> from nltk.corpus import wordnet as wn
403+
>>> # All adjective synsets (heads and satellites) for "good"
404+
>>> syns_a = wn.synsets('good', pos='a')
405+
>>> sorted(set(s.pos() for s in syns_a))
406+
['a', 's']
407+
>>> # Only head adjectives
408+
>>> syns_head = [s for s in syns_a if s.pos() == 'a']
409+
>>> all(s.pos() == 'a' for s in syns_head)
410+
True
411+
>>> # Only satellites
412+
>>> syns_sat = wn.synsets('good', pos='s')
413+
>>> all(s.pos() == 's' for s in syns_sat)
414+
True
415+
>>> # The union when using pos='a' matches the combined sets
416+
>>> set(syns_a) == set(syns_head) | set(syns_sat)
417+
True
418+
>>> # But pos='s' never returns head adjectives
419+
>>> all(s.pos() != 'a' for s in wn.synsets('good', pos='s'))
420+
True
421+
422+
400423
------
401424
Morphy
402425
------
@@ -648,7 +671,7 @@ Issue 3077: Incorrect part-of-speech filtering in all_synsets
648671
>>> next(wn.all_synsets(lang="hrv", pos="v"))
649672
Synset('breathe.v.01')
650673
>>> next(wn.all_synsets(lang="hrv", pos="s"))
651-
Synset('ideological.s.02')
674+
Synset('ideological.s.01')
652675
>>> next(wn.all_synsets(lang="hrv", pos="a"))
653676
Synset('able.a.01')
654677

0 commit comments

Comments
 (0)