Merge pull request nltk#3393 from ekaf/hotfix-3392

stevenbird · web-flow · commit 280e2de15703 · 2025-06-12T15:07:22.000+09:30
Make WordNet interoperable with various taggers and tagged corpora
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
@@ -42,6 +42,7 @@
 from nltk.corpus.reader import CorpusReader
 from nltk.internals import deprecated
 from nltk.probability import FreqDist
+from nltk.tag import map_tag
 from nltk.util import binary_search_file as _binary_search_file
 
 ######################################################################
@@ -70,6 +71,9 @@
 
 POS_LIST = [NOUN, VERB, ADJ, ADV]
 
+# Map Universal Tags (Petrov et al., 2012) to WordNet POS; other tags have no entry
+UNIVERSAL_TAG_TO_WN_POS = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
+
 # A table of strings that are used to express verb frames.
 VERB_FRAME_STRINGS = (
     None,
@@ -2108,6 +2112,38 @@ def filter_forms(forms):
         # 2. Return all that are in the database (and check the original too)
         return filter_forms([form] + forms)
 
+    def tag2pos(self, tag, tagset="en-ptb"):
+        """
+        Convert a part-of-speech tag to a WordNet Part-of-Speech. Unless the tagset
+        is "universal", the tag is first mapped to a Universal Tag (Petrov et al.,
+        2012) with `map_tag`. Return None when WordNet does not cover that POS.
+
+        :param tag: The part-of-speech tag to convert.
+        :type tag: str
+        :param tagset: The tagset of the input tag. Defaults to "en-ptb".
+            Pass "universal" if the tag is already a Universal Tag; any other
+            value is handed to the `map_tag` function from `nltk.tag`.
+            Common examples include:
+                - "en-ptb" (Penn Treebank tagset for English)
+                - "en-brown" (Brown tagset)
+            Supported tagsets: see nltk_data/taggers/universal_tagset.
+        :type tagset: str
+
+        :returns: The corresponding WordNet POS tag ('n', 'v', 'a', 'r') or None
+            if the tag cannot be mapped to a WordNet POS.
+        :rtype: str or None
+
+        Example:
+            >>> import nltk
+            >>> tagged = nltk.tag.pos_tag(nltk.tokenize.word_tokenize("Banks check books."))
+            >>> print([(word, tag, nltk.corpus.wordnet.tag2pos(tag)) for word, tag in tagged])
+            [('Banks', 'NNS', 'n'), ('check', 'VBP', 'v'), ('books', 'NNS', 'n'), ('.', '.', None)]
+        """
+        if tagset != "universal":
+            tag = map_tag(tagset, "universal", tag)
+
+        return UNIVERSAL_TAG_TO_WN_POS.get(tag, None)
+
     #############################################################
     # Create information content from corpus
     #############################################################
diff --git a/nltk/test/unit/test_wordnet.py b/nltk/test/unit/test_wordnet.py
@@ -244,3 +244,49 @@ def test_iterable_type_for_all_lemma_names(self):
         self.assertTrue(hasattr(cat_lemmas, "__iter__"))
         self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
         self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
+
+    def test_en_ptb_tags(self):
+        # Base PTB tags; tag2pos defaults to tagset="en-ptb"
+        self.assertEqual(wn.tag2pos("NN"), "n")  # noun
+        self.assertEqual(wn.tag2pos("VB"), "v")  # verb
+        self.assertEqual(wn.tag2pos("JJ"), "a")  # adjective
+        self.assertEqual(wn.tag2pos("RB"), "r")  # adverb
+
+        # Inflected and comparative PTB tags
+        self.assertEqual(wn.tag2pos("NNS"), "n")  # plural noun
+        self.assertEqual(wn.tag2pos("VBD"), "v")  # verb, past tense
+        self.assertEqual(
+            wn.tag2pos("VBG"), "v"
+        )  # verb, gerund/present participle
+        self.assertEqual(wn.tag2pos("JJR"), "a")  # adjective, comparative
+        self.assertEqual(wn.tag2pos("RBR"), "r")  # adverb, comparative
+
+        # Tags with no WordNet POS equivalent must yield None
+        self.assertIsNone(wn.tag2pos("PRP"))
+        self.assertIsNone(wn.tag2pos("WP"))
+        self.assertIsNone(wn.tag2pos("TO"))
+        self.assertIsNone(wn.tag2pos("PRT"))
+        self.assertIsNone(wn.tag2pos("POS"))
+        self.assertIsNone(wn.tag2pos("."))
+
+    def test_en_brown_tags(self):
+        # Base tags that are also exercised in the PTB test, here via "en-brown"
+        self.assertEqual(wn.tag2pos("NN", tagset="en-brown"), "n")  # noun
+        self.assertEqual(wn.tag2pos("VB", tagset="en-brown"), "v")  # verb
+        self.assertEqual(wn.tag2pos("JJ", tagset="en-brown"), "a")  # adjective
+        self.assertEqual(wn.tag2pos("RB", tagset="en-brown"), "r")  # adverb
+
+        # Brown-specific auxiliary tags (mapped in Brown, not in PTB)
+        self.assertEqual(
+            wn.tag2pos("HV", tagset="en-brown"), "v"
+        )  # 'have' auxiliary (Brown only)
+        self.assertEqual(
+            wn.tag2pos("BEZ", tagset="en-brown"), "v"
+        )  # 'be' auxiliary, 3rd person singular present (Brown only)
+        self.assertEqual(
+            wn.tag2pos("DOZ", tagset="en-brown"), "v"
+        )  # 'do' auxiliary, 3rd person singular present (Brown only)
+
+        # Tags with no WordNet POS equivalent must yield None
+        self.assertIsNone(wn.tag2pos("PPL", tagset="en-brown"))
+        self.assertIsNone(wn.tag2pos("(", tagset="en-brown"))