bpo-45692: Improve support of non-ASCII identifiers in IDLE

serhiy-storchaka · serhiy-storchaka · commit 51c42385c308 · 2021-11-03T10:39:58.000+02:00
diff --git a/Lib/idlelib/autocomplete.py b/Lib/idlelib/autocomplete.py
@@ -28,9 +28,8 @@
 TRY_A = False,    False,    False,   ATTRS  # '.' for attributes.
 TRY_F = False,    False,    False,   FILES  # '/' in quotes for file name.
 
-# This string includes all chars that may be in an identifier.
-# TODO Update this here and elsewhere.
-ID_CHARS = string.ascii_letters + string.digits + "_"
+# all ASCII chars that may be in an identifier
+_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
 
 SEPS = f"{os.sep}{os.altsep if os.altsep else ''}"
 TRIGGERS = f".{SEPS}"
@@ -134,7 +133,11 @@ def open_completions(self, args):
         elif hp.is_in_code() and (not mode or mode==ATTRS):
             self._remove_autocomplete_window()
             mode = ATTRS
-            while i and (curline[i-1] in ID_CHARS or ord(curline[i-1]) > 127):
+            while i:
+                c = curline[i-1]
+                if c not in _ASCII_ID_CHARS:
+                    if c <= '\x7f' or not ('a' + c).isidentifier():
+                        break
                 i -= 1
             comp_start = curline[i:j]
             if i and curline[i-1] == '.':  # Need object with attributes.
diff --git a/Lib/idlelib/autoexpand.py b/Lib/idlelib/autoexpand.py
@@ -13,12 +13,10 @@
 There is only one instance of Autoexpand.
 '''
 import re
-import string
 
+_LAST_WORD_RE = re.compile(r'\b\w+\Z')
 
 class AutoExpand:
-    wordchars = string.ascii_letters + string.digits + "_"
-
     def __init__(self, editwin):
         self.text = editwin.text
         self.bell = self.text.bell
@@ -85,10 +83,8 @@ def getwords(self):
     def getprevword(self):
         "Return the word prefix before the cursor."
         line = self.text.get("insert linestart", "insert")
-        i = len(line)
-        while i > 0 and line[i-1] in self.wordchars:
-            i = i-1
-        return line[i:]
+        m = _LAST_WORD_RE.search(line)
+        return m[0] if m else ''
 
 
 if __name__ == '__main__':
diff --git a/Lib/idlelib/editor.py b/Lib/idlelib/editor.py
@@ -3,7 +3,6 @@
 import os
 import platform
 import re
-import string
 import sys
 import tokenize
 import traceback
@@ -806,14 +805,12 @@ def ResetColorizer(self):
         if self.line_numbers is not None:
             self.line_numbers.update_colors()
 
-    IDENTCHARS = string.ascii_letters + string.digits + "_"
-
     def colorize_syntax_error(self, text, pos):
         text.tag_add("ERROR", pos)
         char = text.get(pos)
-        if char and char in self.IDENTCHARS:
+        if char and ('a' + char).isidentifier():
             text.tag_add("ERROR", pos + " wordstart", pos)
-        if '\n' == text.get(pos):   # error at line end
+        if char == '\n':   # error at line end
             text.mark_set("insert", pos)
         else:
             text.mark_set("insert", pos + "+1c")
diff --git a/Lib/idlelib/hyperparser.py b/Lib/idlelib/hyperparser.py
@@ -14,13 +14,6 @@
 # all ASCII chars that may be the first char of an identifier
 _ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
 
-# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
-_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
-# lookup table for whether 7-bit ASCII chars are valid as the first
-# char in a Python identifier
-_IS_ASCII_ID_FIRST_CHAR = \
-    [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
-
 
 class HyperParser:
     def __init__(self, editwin, index):
@@ -166,53 +159,47 @@ def _eat_identifier(cls, str, limit, pos):
 
         This ignores keywords: they are not considered identifiers here,
         except for True, False and None.
         """
-        is_ascii_id_char = _IS_ASCII_ID_CHAR
-
         # Start at the end (pos) and work backwards.
         i = pos
 
         # Go backwards as long as the characters are valid ASCII
         # identifier characters. This is an optimization, since it
         # is faster in the common case where most of the characters
         # are ASCII.
-        while i > limit and (
-                ord(str[i - 1]) < 128 and
-                is_ascii_id_char[ord(str[i - 1])]
-        ):
+        while i > limit and str[i - 1] in _ASCII_ID_CHARS:
             i -= 1
 
         # If the above loop ended due to reaching a non-ASCII
         # character, continue going backwards using the most generic
         # test for whether a string contains only valid identifier
         # characters.
-        if i > limit and ord(str[i - 1]) >= 128:
-            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
+        if i > limit and str[i - 1] > '\x7f':
+            while i - 4 >= limit and ('a' + str[i - 4:i]).isidentifier():
                 i -= 4
-            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
+            if i - 2 >= limit and ('a' + str[i - 2:i]).isidentifier():
                 i -= 2
-            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
+            if i - 1 >= limit and ('a' + str[i - 1]).isidentifier():
                 i -= 1
 
             # The identifier candidate starts here. If it isn't a valid
             # identifier, don't eat anything. At this point that is only
             # possible if the first character isn't a valid first
             # character for an identifier.
-            if not str[i:pos].isidentifier():
+            if i < pos and not str[i].isidentifier():
                 return 0
         elif i < pos:
             # All characters in str[i:pos] are valid ASCII identifier
             # characters, so it is enough to check that the first is
             # valid as the first character of an identifier.
-            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
+            if str[i] not in _ASCII_ID_FIRST_CHARS:
                 return 0
 
         # All keywords are valid identifiers, but should not be
         # considered identifiers here, except for True, False and None.
-        if i < pos and (
-                iskeyword(str[i:pos]) and
-                str[i:pos] not in cls._ID_KEYWORDS
-        ):
-            return 0
+        if i < pos:
+            word = str[i:pos]
+            if iskeyword(word) and word not in cls._ID_KEYWORDS:
+                return 0
 
         return pos - i
 
diff --git a/Lib/idlelib/undo.py b/Lib/idlelib/undo.py
@@ -1,5 +1,3 @@
-import string
-
 from idlelib.delegator import Delegator
 
 # tkinter import not needed because module does not create widgets,
@@ -251,10 +249,8 @@ def merge(self, cmd):
         self.chars = self.chars + cmd.chars
         return True
 
-    alphanumeric = string.ascii_letters + string.digits + "_"
-
     def classify(self, c):
-        if c in self.alphanumeric:
+        if ('a' + c).isidentifier():
             return "alphanumeric"
         if c == "\n":
             return "newline"
diff --git a/Misc/NEWS.d/next/IDLE/2021-11-03-10-37-29.bpo-45692.QSuHbM.rst b/Misc/NEWS.d/next/IDLE/2021-11-03-10-37-29.bpo-45692.QSuHbM.rst
@@ -0,0 +1,3 @@
+Improve support of non-ASCII identifiers in IDLE
+(autoexpanding, autocompletion, undo, etc).
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Improve support of non-ASCII identifiers in IDLE`
	`2`	`+(autoexpanding, autocompletion, undo, etc).`
	`3`	`+`