Commit 5f69622

Merge pull request nltk#3172 from BroMattMiller/feature/Text-concordance-line-alignment
Align text.ConcordanceIndex.find_concordance()
2 parents (59a1dbc + ce6a0a2), commit 5f69622

File tree: 3 files changed (+46 / -4 lines)
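
For context before the diffs: the commit changes how nltk.text.Text.concordance() (which ultimately calls ConcordanceIndex.find_concordance()) measures line width, so that words containing Unicode combining characters no longer push the concordance columns out of alignment. The snippet below is a minimal illustrative sketch of that situation; the token list is made up for the example:

from nltk.text import Text

# "cafe\u0301" spells 'café' with a combining acute accent, so len() of the
# token (5) is larger than the number of columns it occupies on screen (4).
tokens = ["I", "had", "a", "cafe\u0301", "au", "lait", "at", "the", "cafe\u0301"] * 5
text = Text(tokens)

# Each printed line should keep the query word in the same column; with this
# commit, widths are computed from visible characters rather than raw len().
text.concordance("cafe\u0301", width=40)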

AUTHORS.md

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@
 - David McClosky
 - Xinfan Meng
 - Dmitrijs Milajevs
+- Matt Miller
 - Margaret Mitchell
 - Tomonori Nagano
 - Jason Narad

nltk/text.py

Lines changed: 9 additions & 4 deletions
@@ -16,6 +16,7 @@
 
 import re
 import sys
+import unicodedata
 from collections import Counter, defaultdict, namedtuple
 from functools import reduce
 from math import log
@@ -27,7 +28,7 @@
 from nltk.probability import ConditionalFreqDist as CFD
 from nltk.probability import FreqDist
 from nltk.tokenize import sent_tokenize
-from nltk.util import LazyConcatenation, tokenwrap
+from nltk.util import LazyConcatenation, cut_string, tokenwrap
 
 ConcordanceLine = namedtuple(
     "ConcordanceLine",
@@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
         else:
             phrase = [word]
 
-        half_width = (width - len(" ".join(phrase)) - 2) // 2
+        phrase_str = " ".join(phrase)
+        phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
+        half_width = (width - phrase_len - 2) // 2
         context = width // 4  # approx number of words of context
 
         # Find the instances of the word to create the ConcordanceLine
@@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
             left_context = self._tokens[max(0, i - context) : i]
             right_context = self._tokens[i + len(phrase) : i + context]
             # Create the pretty lines with the query_word in the middle.
-            left_print = " ".join(left_context)[-half_width:]
-            right_print = " ".join(right_context)[:half_width]
+            left_print = cut_string(" ".join(left_context), -half_width).rjust(
+                half_width
+            )
+            right_print = cut_string(" ".join(right_context), half_width)
             # The WYSIWYG line of the concordance.
             line_print = " ".join([left_print, query_word, right_print])
             # Create the ConcordanceLine
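
The core of the text.py change is that len() counts combining marks even though they occupy no terminal column, so the visible width has to be counted by skipping them. A quick illustration of that computation (the sample string is made up; the variable names mirror the diff):

import unicodedata

phrase_str = "cafe\u0301"  # 'café' written with a combining acute accent

print(len(phrase_str))  # 5 -- the combining mark is counted

# Count only characters that actually occupy a column, as in the diff above.
phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
print(phrase_len)  # 4

# half_width is then derived from the visible width of the query phrase.
width = 80
half_width = (width - phrase_len - 2) // 2
print(half_width)  # 37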

nltk/util.py

Lines changed: 36 additions & 0 deletions
@@ -12,6 +12,7 @@
 import pydoc
 import re
 import textwrap
+import unicodedata
 import warnings
 from collections import defaultdict, deque
 from itertools import chain, combinations, islice, tee
@@ -139,6 +140,41 @@ def tokenwrap(tokens, separator=" ", width=70):
     return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
 
 
+def cut_string(s, width=70):
+    """
+    Cut off and return a given width of a string.
+
+    Return the same as s[:width] if width >= 0, or s[width:] if
+    width < 0, as long as s has no unicode combining characters.
+    If it has combining characters, make sure the returned string's
+    visible width matches the called-for width.
+
+    :param s: the string to cut
+    :type s: str
+    :param width: the display width
+    :type width: int
+    """
+    chars_sofar = 0
+    width_sofar = 0
+    result = ""
+
+    abs_width = abs(width)
+    max_chars = len(s)
+    while width_sofar < abs_width and chars_sofar < max_chars:
+        if width < 0:
+            char = s[-(chars_sofar + 1)]
+            result = char + result
+        else:
+            char = s[chars_sofar]
+            result = result + char
+
+        chars_sofar += 1
+        if not unicodedata.combining(char):
+            width_sofar += 1
+
+    return result
+
+
 ##########################################################################
 # Indexing
 ##########################################################################
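
A short usage sketch of the new helper, assuming an nltk build that already includes this commit (on older versions, the function body above can be copied locally):

from nltk.util import cut_string  # available once this commit is installed

s = "cre\u0300me bru\u0302le\u0301e"  # 'crème brûlée' with combining accents: 15 code points, 12 columns

# Positive width: first 5 visible characters, keeping combining marks with their bases.
print(cut_string(s, 5))   # 'crème' -- 6 code points, 5 visible columns

# Negative width: last 5 visible characters, analogous to s[-5:] for plain ASCII.
print(cut_string(s, -5))  # 'rûlée' -- 7 code points, 5 visible columns

# Without combining characters it matches ordinary slicing.
assert cut_string("concordance", 5) == "conco"
assert cut_string("concordance", -5) == "dance"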
