Align text.ConcordanceIndex.find_concordance()

BroMattMiller · BroMattMiller · commit ce6a0a28ad4f · 2023-06-28T08:38:46.000-07:00
There were two cases where the alignment of
text.ConcordanceIndex.find_concordance() output was broken:

* The left context of the query was very small. That is, the query
  matched very close to the beginning of the text.
* The text or query contained Unicode combining characters.

The first case is addressed by right-justifying the left context to the
proper width. The second case is addressed by replacing the standard
Python len(s) with a count that excludes combining characters, and
replacing s[:width] and s[-width:] with a new utility function,
nltk.util.cut_string(), that accounts for combining characters.
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -127,6 +127,7 @@
 - David McClosky
 - Xinfan Meng
 - Dmitrijs Milajevs
+- Matt Miller
 - Margaret Mitchell
 - Tomonori Nagano
 - Jason Narad
diff --git a/nltk/text.py b/nltk/text.py
@@ -16,6 +16,7 @@
 
 import re
 import sys
+import unicodedata
 from collections import Counter, defaultdict, namedtuple
 from functools import reduce
 from math import log
@@ -27,7 +28,7 @@
 from nltk.probability import ConditionalFreqDist as CFD
 from nltk.probability import FreqDist
 from nltk.tokenize import sent_tokenize
-from nltk.util import LazyConcatenation, tokenwrap
+from nltk.util import LazyConcatenation, cut_string, tokenwrap
 
 ConcordanceLine = namedtuple(
     "ConcordanceLine",
@@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
         else:
             phrase = [word]
 
-        half_width = (width - len(" ".join(phrase)) - 2) // 2
+        phrase_str = " ".join(phrase)
+        phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
+        half_width = (width - phrase_len - 2) // 2
         context = width // 4  # approx number of words of context
 
         # Find the instances of the word to create the ConcordanceLine
@@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
                 left_context = self._tokens[max(0, i - context) : i]
                 right_context = self._tokens[i + len(phrase) : i + context]
                 # Create the pretty lines with the query_word in the middle.
-                left_print = " ".join(left_context)[-half_width:]
-                right_print = " ".join(right_context)[:half_width]
+                left_print = cut_string(" ".join(left_context), -half_width).rjust(
+                    half_width
+                )
+                right_print = cut_string(" ".join(right_context), half_width)
                 # The WYSIWYG line of the concordance.
                 line_print = " ".join([left_print, query_word, right_print])
                 # Create the ConcordanceLine
diff --git a/nltk/util.py b/nltk/util.py
@@ -12,6 +12,7 @@
 import pydoc
 import re
 import textwrap
+import unicodedata
 import warnings
 from collections import defaultdict, deque
 from itertools import chain, combinations, islice, tee
@@ -139,6 +140,41 @@ def tokenwrap(tokens, separator=" ", width=70):
     return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
 
 
+def cut_string(s, width=70):
+    """
+    Cut off and return a given display width of a string.
+
+    Return the same as s[:width] if width >= 0 or s[width:] if width < 0,
+    as long as s has no unicode combining characters. Combining characters
+    take up no display width and always stay attached to the character
+    they modify, so a cut never strips the marks off the last character.
+
+    :param s: the string to cut
+    :type s: str
+    :param width: the display width; negative values cut from the end
+    :type width: int
+    :return: the leading (or trailing) part of s of that display width
+    :rtype: str
+    """
+    wanted, seen = abs(width), 0
+    if wanted == 0:
+        return ""
+    if width < 0:
+        # Walking backwards, marks are passed before the base they modify.
+        for start in range(len(s) - 1, -1, -1):
+            if not unicodedata.combining(s[start]):
+                seen += 1
+                if seen == wanted:
+                    return s[start:]
+        return s
+    for end, char in enumerate(s):
+        if not unicodedata.combining(char):
+            if seen == wanted:  # first base character past the cut
+                return s[:end]
+            seen += 1
+    return s
+
+
 ##########################################################################
 # Indexing
 ##########################################################################