1616
1717import re
1818import sys
19+ import unicodedata
1920from collections import Counter , defaultdict , namedtuple
2021from functools import reduce
2122from math import log
2728from nltk .probability import ConditionalFreqDist as CFD
2829from nltk .probability import FreqDist
2930from nltk .tokenize import sent_tokenize
30- from nltk .util import LazyConcatenation , tokenwrap
31+ from nltk .util import LazyConcatenation , cut_string , tokenwrap
3132
3233ConcordanceLine = namedtuple (
3334 "ConcordanceLine" ,
@@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
193194 else :
194195 phrase = [word ]
195196
196- half_width = (width - len (" " .join (phrase )) - 2 ) // 2
197+ phrase_str = " " .join (phrase )
198+ phrase_len = sum (1 for char in phrase_str if not unicodedata .combining (char ))
199+ half_width = (width - phrase_len - 2 ) // 2
197200 context = width // 4 # approx number of words of context
198201
199202 # Find the instances of the word to create the ConcordanceLine
@@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
209212 left_context = self ._tokens [max (0 , i - context ) : i ]
210213 right_context = self ._tokens [i + len (phrase ) : i + context ]
211214 # Create the pretty lines with the query_word in the middle.
212- left_print = " " .join (left_context )[- half_width :]
213- right_print = " " .join (right_context )[:half_width ]
215+ left_print = cut_string (" " .join (left_context ), - half_width ).rjust (
216+ half_width
217+ )
218+ right_print = cut_string (" " .join (right_context ), half_width )
214219 # The WYSIWYG line of the concordance.
215220 line_print = " " .join ([left_print , query_word , right_print ])
216221 # Create the ConcordanceLine
0 commit comments