
Commit 980f435

Merge branch 'develop' of https://github.com/nltk/nltk into develop
2 parents 24936a2 + 04d6a55

4 files changed (+64 -48 lines)


nltk/corpus/reader/wordnet.py

Lines changed: 16 additions & 36 deletions
@@ -1421,6 +1421,7 @@ def _next_token():
                     # map lemmas and parts of speech to synsets
                     self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                     if pos == ADJ:
+                        # Duplicate all adjectives indiscriminately?:
                         self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets

     def _load_exception_map(self):
@@ -2016,8 +2017,9 @@ def morphy(self, form, pos=None, check_exceptions=True):
         """
         Find a possible base form for the given form, with the given
         part of speech, by checking WordNet's list of exceptional
-        forms, and by recursively stripping affixes for this part of
-        speech until a form in WordNet is found.
+        forms, or by substituting suffixes for this part of speech.
+        If pos=None, try every part of speech until finding lemmas.
+        Return the first form found in WordNet, or eventually None.

         >>> from nltk.corpus import wordnet as wn
         >>> print(wn.morphy('dogs'))
@@ -2033,19 +2035,11 @@ def morphy(self, form, pos=None, check_exceptions=True):
         book
         >>> wn.morphy('book', wn.ADJ)
         """
-
-        if pos is None:
-            morphy = self._morphy
-            analyses = chain(a for p in POS_LIST for a in morphy(form, p))
-        else:
+        for pos in [pos] if pos else POS_LIST:
             analyses = self._morphy(form, pos, check_exceptions)
-
-        # get the first one we find
-        first = list(islice(analyses, 1))
-        if len(first) == 1:
-            return first[0]
-        else:
-            return None
+            if analyses:
+                # Stop (don't try more parts of speech):
+                return analyses[0]

     MORPHOLOGICAL_SUBSTITUTIONS = {
         NOUN: [
@@ -2080,8 +2074,7 @@ def _morphy(self, form, pos, check_exceptions=True):
         # Given an original string x
         # 1. Apply rules once to the input to get y1, y2, y3, etc.
         # 2. Return all that are in the database
-        # 3. If there are no matches, keep applying rules until you either
-        #    find a match or you can't go any further
+        # (edited by ekaf) If there are no matches return an empty list.

         exceptions = self._exception_map[pos]
         substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
@@ -2105,28 +2098,15 @@ def filter_forms(forms):
                             seen.add(form)
             return result

-        # 0. Check the exception lists
-        if check_exceptions:
-            if form in exceptions:
-                return filter_forms([form] + exceptions[form])
-
-        # 1. Apply rules once to the input to get y1, y2, y3, etc.
-        forms = apply_rules([form])
+        if check_exceptions and form in exceptions:
+            # 0. Check the exception lists
+            forms = exceptions[form]
+        else:
+            # 1. Apply rules once to the input to get y1, y2, y3, etc.
+            forms = apply_rules([form])

         # 2. Return all that are in the database (and check the original too)
-        results = filter_forms([form] + forms)
-        if results:
-            return results
-
-        # 3. If there are no matches, keep applying rules until we find a match
-        while forms:
-            forms = apply_rules(forms)
-            results = filter_forms(forms)
-            if results:
-                return results
-
-        # Return an empty list if we can't find anything
-        return []
+        return filter_forms([form] + forms)

     #############################################################
     # Create information content from corpus
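
For orientation, a short usage sketch of the rewritten morphy(): it now tries each requested part of speech exactly once and returns the first lemma that _morphy() yields, or None. The expected outputs below follow the doctests shown in this diff and assume the WordNet corpus data is installed.

    from nltk.corpus import wordnet as wn

    # One pass per part of speech; rules are no longer applied repeatedly.
    print(wn.morphy('dogs'))           # dog
    print(wn.morphy('book', wn.NOUN))  # book
    print(wn.morphy('book', wn.ADJ))   # None: 'book' has no adjectival reading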

nltk/downloader.py

Lines changed: 1 addition & 1 deletion
@@ -1270,7 +1270,7 @@ def _simple_interactive_config(self):
                     print(" Cancelled!")
                 else:
                     if not new_url.startswith(("http://", "https://")):
-                        new_url = "http://" + new_url
+                        new_url = "https://" + new_url
                     try:
                         self._ds.url = new_url
                     except Exception as e:
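
The effect of this one-line change, shown as a standalone sketch (the helper name is illustrative, not part of the downloader API): a server index URL entered without a scheme in the downloader's interactive configuration is now completed with "https://" rather than "http://".

    def normalize_index_url(new_url: str) -> str:
        # Mirrors the check in _simple_interactive_config().
        if not new_url.startswith(("http://", "https://")):
            new_url = "https://" + new_url
        return new_url

    print(normalize_index_url("example.org/nltk_data/index.xml"))
    # https://example.org/nltk_data/index.xml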

nltk/parse/generate.py

Lines changed: 5 additions & 2 deletions
@@ -3,6 +3,7 @@
 # Copyright (C) 2001-2024 NLTK Project
 # Author: Steven Bird <[email protected]>
 #         Peter Ljunglöf <[email protected]>
+#         Eric Kafe <[email protected]>
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT
 #
@@ -26,7 +27,8 @@ def generate(grammar, start=None, depth=None, n=None):
     if not start:
         start = grammar.start()
     if depth is None:
-        depth = sys.maxsize
+        # Safe default, assuming the grammar may be recursive:
+        depth = (sys.getrecursionlimit() // 3) - 3

     iter = _generate_all(grammar, [start], depth)

@@ -45,7 +47,8 @@ def _generate_all(grammar, items, depth):
         except RecursionError as error:
             # Helpful error message while still showing the recursion stack.
             raise RuntimeError(
-                "The grammar has rule(s) that yield infinite recursion!"
+                "The grammar has rule(s) that yield infinite recursion!\n\
+Eventually use a lower 'depth', or a higher 'sys.setrecursionlimit()'."
             ) from error
     else:
         yield []
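
A minimal usage sketch of generate() under the new default; the toy grammar and the item count are illustrative, not taken from the diff. With depth=None, the derivation depth is now capped at (sys.getrecursionlimit() // 3) - 3 instead of sys.maxsize, so generation from a recursive grammar stays bounded instead of overflowing the stack.

    from nltk import CFG
    from nltk.parse.generate import generate

    grammar = CFG.fromstring("""
        S -> NP VP
        NP -> Det N
        VP -> V NP
        Det -> 'the'
        N -> 'dog' | 'cat'
        V -> 'chased'
    """)

    # n limits how many sentences are drawn from the generator.
    for sentence in generate(grammar, n=4):
        print(" ".join(sentence))
    # the dog chased the dog
    # the dog chased the cat
    # the cat chased the dog
    # the cat chased the cat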

nltk/stem/wordnet.py

Lines changed: 42 additions & 9 deletions
@@ -3,6 +3,7 @@
 # Copyright (C) 2001-2024 NLTK Project
 # Author: Steven Bird <[email protected]>
 #         Edward Loper <[email protected]>
+#         Eric Kafe <[email protected]>
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT

@@ -13,8 +14,45 @@ class WordNetLemmatizer:
     """
     WordNet Lemmatizer

-    Lemmatize using WordNet's built-in morphy function.
-    Returns the input word unchanged if it cannot be found in WordNet.
+    Provides 3 lemmatizer modes:
+
+    1. _morphy() is an alias to WordNet's _morphy lemmatizer.
+    It returns a list of all lemmas found in WordNet.
+
+    >>> wnl = WordNetLemmatizer()
+    >>> print(wnl._morphy('us', 'n'))
+    ['us', 'u']
+
+    2. morphy() is a restrictive wrapper around _morphy().
+    It returns the first lemma found in WordNet,
+    or None if no lemma is found.
+
+    >>> print(wnl.morphy('us', 'n'))
+    us
+
+    >>> print(wnl.morphy('catss'))
+    None
+
+    3. lemmatize() is a permissive wrapper around _morphy().
+    It returns the shortest lemma found in WordNet,
+    or the input string unchanged if nothing is found.
+
+    >>> print(wnl.lemmatize('us', 'n'))
+    u
+
+    >>> print(wnl.lemmatize('Anythinggoeszxcv'))
+    Anythinggoeszxcv
+
+    """
+
+    morphy = wn.morphy
+
+    _morphy = wn._morphy
+
+    def lemmatize(self, word: str, pos: str = "n") -> str:
+        """Lemmatize `word` by picking the shortest of the possible lemmas,
+        using the wordnet corpus reader's built-in _morphy function.
+        Returns the input word unchanged if it cannot be found in WordNet.

     >>> from nltk.stem import WordNetLemmatizer
     >>> wnl = WordNetLemmatizer()
@@ -28,21 +66,16 @@ class WordNetLemmatizer:
     abacus
     >>> print(wnl.lemmatize('hardrock'))
     hardrock
-    """
-
-    def lemmatize(self, word: str, pos: str = "n") -> str:
-        """Lemmatize `word` using WordNet's built-in morphy function.
-        Returns the input word unchanged if it cannot be found in WordNet.

         :param word: The input word to lemmatize.
         :type word: str
         :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
             `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
             for satellite adjectives.
         :type pos: str
-        :return: The lemma of `word`, for the given `pos`.
+        :return: The shortest lemma of `word`, for the given `pos`.
         """
-        lemmas = wn._morphy(word, pos)
+        lemmas = self._morphy(word, pos)
         return min(lemmas, key=len) if lemmas else word

     def __repr__(self):
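
The three modes documented above can be compared side by side; the expected outputs are exactly those given in the new docstring and assume the WordNet corpus data is installed.

    from nltk.stem import WordNetLemmatizer

    wnl = WordNetLemmatizer()

    # _morphy(): every lemma WordNet knows for the form/POS pair.
    print(wnl._morphy('us', 'n'))             # ['us', 'u']

    # morphy(): the first lemma found, or None.
    print(wnl.morphy('us', 'n'))              # us
    print(wnl.morphy('catss'))                # None

    # lemmatize(): the shortest lemma found, or the input unchanged.
    print(wnl.lemmatize('us', 'n'))           # u
    print(wnl.lemmatize('Anythinggoeszxcv'))  # Anythinggoeszxcv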
