Merge pull request nltk#3400 from ekaf/hotfix-3399

stevenbird · web-flow · commit d30a954a3247 · 2025-07-06T12:04:15.000+09:30
Duplicate only real adjective satellites
diff --git a/nltk/corpus/reader/sentiwordnet.py b/nltk/corpus/reader/sentiwordnet.py
@@ -23,7 +23,7 @@
  SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
  SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
  SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
- SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
+ SentiSynset('dull.s.05'), SentiSynset('slowly.r.01'),\
  SentiSynset('behind.r.03')]
     >>> happy = swn.senti_synsets('happy', 'a')
     >>> happy0 = list(happy)[0]
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
@@ -1204,6 +1204,9 @@ def __init__(self, root, omw_reader):
                 assert int(index) == i
                 self._lexnames.append(lexname)
 
+        # Build a set of adjective satellite offsets
+        self._scan_satellites()
+
         # Load the indices for lemmas and synset offsets
         self._load_lemma_pos_offset_map()
 
@@ -1324,7 +1327,7 @@ def _load_lang_data(self, lang):
             self.add_omw()
 
         if lang not in self.langs():
-            raise WordNetError("Language is not supported.")
+            raise WordNetError(f"Language {lang} is not supported.")
 
         if self._exomw_reader and lang not in self.omw_langs:
             reader = self._exomw_reader
@@ -1379,6 +1382,30 @@ def langs(self):
         """return a list of languages supported by Multilingual Wordnet"""
         return list(self.provenances.keys())
 
+    def _scan_satellites(self):
+        """
+        Record the offsets of all adjective satellite synsets found in the adjective data file.
+
+        Iterates over the file returned by ``self._data_file(ADJ)``, reading the first
+        whitespace-separated field of each line as the synset offset and the third as the
+        synset type, and stores the integer offsets of type 's' in ``self.satellite_offsets``.
+        """
+        adj_data_file = self._data_file(ADJ)
+        satellite_offsets = set()
+        adj_data_file.seek(0)  # Read from the start of the stream
+        for line in adj_data_file:
+            if not line.strip() or line.startswith(" "):  # Skip blank and space-indented lines
+                continue
+            fields = line.strip().split()
+            if len(fields) < 3:  # Too short to carry a synset type field
+                continue
+            synset_offset = fields[0]
+            synset_type = fields[2]
+            if synset_type == "s":
+                satellite_offsets.add(int(synset_offset))
+        adj_data_file.seek(0)  # Rewind so a later reader of this stream starts at the top
+        self.satellite_offsets = satellite_offsets
+
     def _load_lemma_pos_offset_map(self):
         for suffix in self._FILEMAP.values():
             # parse each line of the file (ignoring comment lines)
@@ -1425,8 +1452,15 @@ def _next_token():
                     # map lemmas and parts of speech to synsets
                     self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                     if pos == ADJ:
-                        # Duplicate all adjectives indiscriminately?:
-                        self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
+                        # index.adj uses only the ADJ pos, so identify ADJ_SAT using satellites set
+                        satellite_offsets = [
+                            # Keep the ordering from index.adj
+                            offset
+                            for offset in synset_offsets
+                            if offset in self.satellite_offsets
+                        ]
+                        # Duplicate only a (possibly empty) list of real satellites
+                        self._lemma_pos_offset_map[lemma][ADJ_SAT] = satellite_offsets
 
     def _load_exception_map(self):
         # load the exception file data into memory
diff --git a/nltk/test/wordnet.doctest b/nltk/test/wordnet.doctest
@@ -156,7 +156,7 @@ Lemmas
     >>> wn.lemma_from_key(eat.key()).synset()
     Synset('feed.v.06')
     >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00')
-    Lemma('backward.s.03.feebleminded')
+    Lemma('backward.s.01.feebleminded')
     >>> for lemma in wn.synset('eat.v.03').lemmas():
     ...     print(lemma, lemma.count())
     ...
@@ -397,6 +397,29 @@ Walk through the noun synsets looking at their hypernyms:
     Synset('object.n.01') [Synset('physical_entity.n.01')]
 
 
+Issue 3399: When specifying pos="a", both head adjectives and adjective satellites are returned.
+
+    >>> from nltk.corpus import wordnet as wn
+    >>> # All adjective synsets (heads and satellites) for "good"
+    >>> syns_a = wn.synsets('good', pos='a')
+    >>> sorted(set(s.pos() for s in syns_a))
+    ['a', 's']
+    >>> # Only head adjectives
+    >>> syns_head = [s for s in syns_a if s.pos() == 'a']
+    >>> all(s.pos() == 'a' for s in syns_head)
+    True
+    >>> # Only satellites
+    >>> syns_sat = wn.synsets('good', pos='s')
+    >>> all(s.pos() == 's' for s in syns_sat)
+    True
+    >>> # The union when using pos='a' matches the combined sets
+    >>> set(syns_a) == set(syns_head) | set(syns_sat)
+    True
+    >>> # But pos='s' never returns head adjectives
+    >>> all(s.pos() != 'a' for s in wn.synsets('good', pos='s'))
+    True
+
+
 ------
 Morphy
 ------
@@ -648,7 +671,7 @@ Issue 3077: Incorrect part-of-speech filtering in all_synsets
     >>> next(wn.all_synsets(lang="hrv", pos="v"))
     Synset('breathe.v.01')
     >>> next(wn.all_synsets(lang="hrv", pos="s"))
-    Synset('ideological.s.02')
+    Synset('ideological.s.01')
     >>> next(wn.all_synsets(lang="hrv", pos="a"))
     Synset('able.a.01')