Skip to content

Commit 280e2de

Browse files
authored
Merge pull request nltk#3393 from ekaf/hotfix-3392
Make Wordnet interoperable with various taggers and tagged corpora
2 parents ebaf5f9 + ba18c70 commit 280e2de

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

nltk/corpus/reader/wordnet.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from nltk.corpus.reader import CorpusReader
4343
from nltk.internals import deprecated
4444
from nltk.probability import FreqDist
45+
from nltk.tag import map_tag
4546
from nltk.util import binary_search_file as _binary_search_file
4647

4748
######################################################################
@@ -70,6 +71,9 @@
7071

7172
POS_LIST = [NOUN, VERB, ADJ, ADV]
7273

74+
# Map Universal POS tags (Petrov et al., 2012) to WordNet POS letters
75+
UNIVERSAL_TAG_TO_WN_POS = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
76+
7377
# A table of strings that are used to express verb frames.
7478
VERB_FRAME_STRINGS = (
7579
None,
@@ -2108,6 +2112,38 @@ def filter_forms(forms):
21082112
# 2. Return all that are in the database (and check the original too)
21092113
return filter_forms([form] + forms)
21102114

2115+
def tag2pos(self, tag, tagset="en-ptb"):
    """
    Translate a part-of-speech tag into the matching WordNet POS letter.

    The tag is first normalised to the Universal tagset (Petrov et al., 2012)
    with ``nltk.tag.map_tag`` -- unless ``tagset`` is already "universal" --
    and the resulting Universal tag is then looked up in
    ``UNIVERSAL_TAG_TO_WN_POS``.

    :param tag: The part-of-speech tag to convert.
    :type tag: str
    :param tagset: The tagset that ``tag`` belongs to. Defaults to "en-ptb".
        Any tagset name accepted by ``map_tag`` (i.e. one of the mappings in
        nltk_data/taggers/universal_tagset) may be given, for example:
        - "en-ptb" (Penn Treebank tagset for English)
        - "en-brown" (Brown tagset)
        Pass "universal" when ``tag`` is already a Universal tag; no
        mapping step is performed in that case.
    :type tagset: str

    :returns: The corresponding WordNet POS tag ('n', 'v', 'a', 'r'), or
        None when the Universal tag has no entry in
        ``UNIVERSAL_TAG_TO_WN_POS``.
    :rtype: str or None

    Example:
    >>> import nltk
    >>> tagged = nltk.tag.pos_tag(nltk.tokenize.word_tokenize("Banks check books."))
    >>> print([(word, tag, nltk.corpus.wordnet.tag2pos(tag)) for word, tag in tagged])
    [('Banks', 'NNS', 'n'), ('check', 'VBP', 'v'), ('books', 'NNS', 'n'), ('.', '.', None)]
    """
    # Universal tags are the pivot format; only foreign tagsets need mapping.
    universal_tag = (
        tag if tagset == "universal" else map_tag(tagset, "universal", tag)
    )
    return UNIVERSAL_TAG_TO_WN_POS.get(universal_tag)
2146+
21112147
#############################################################
21122148
# Create information content from corpus
21132149
#############################################################

nltk/test/unit/test_wordnet.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,49 @@ def test_iterable_type_for_all_lemma_names(self):
244244
self.assertTrue(hasattr(cat_lemmas, "__iter__"))
245245
self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
246246
self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
247+
248+
def test_en_ptb_tags(self):
    """Check tag2pos on Penn Treebank tags, the default tagset."""
    expected_pos = (
        # Common PTB tags (mapped in both PTB and Brown)
        ("NN", "n"),  # noun
        ("VB", "v"),  # verb
        ("JJ", "a"),  # adjective
        ("RB", "r"),  # adverb
        # PTB-specific tags (mapped in PTB, not in Brown)
        ("NNS", "n"),  # plural noun
        ("VBD", "v"),  # verb, past tense
        ("VBG", "v"),  # verb, gerund/present participle
        ("JJR", "a"),  # adjective, comparative
        ("RBR", "r"),  # adverb, comparative
    )
    for ptb_tag, wn_pos in expected_pos:
        self.assertEqual(wn.tag2pos(ptb_tag), wn_pos)

    # Tags that should yield None (not mapped in WordNet)
    for ptb_tag in ("PRP", "WP", "TO", "PRT", "POS", "."):
        self.assertIsNone(wn.tag2pos(ptb_tag))
271+
272+
def test_en_brown_tags(self):
    """Check tag2pos on Brown corpus tags (tagset="en-brown")."""
    expected_pos = (
        # Common Brown tags (mapped in both PTB and Brown)
        ("NN", "n"),  # noun
        ("VB", "v"),  # verb
        ("JJ", "a"),  # adjective
        ("RB", "r"),  # adverb
        # Brown-specific tags (mapped in Brown, not in PTB)
        ("HV", "v"),  # 'have' auxiliary
        ("BEZ", "v"),  # 'be' auxiliary, 3rd person singular present
        ("DOZ", "v"),  # 'do' auxiliary, 3rd person singular present
    )
    for brown_tag, wn_pos in expected_pos:
        self.assertEqual(wn.tag2pos(brown_tag, tagset="en-brown"), wn_pos)

    # Tags that should yield None (not mapped in WordNet)
    for brown_tag in ("PPL", "("):
        self.assertIsNone(wn.tag2pos(brown_tag, tagset="en-brown"))

0 commit comments

Comments
 (0)