Skip to content

Commit ce6a0a2

Browse files
committed
Align text.ConcordanceIndex.find_concordance()
The alignment of text.ConcordanceIndex.find_concordance() output was broken in two cases: (1) the left context of the query was very short, that is, the query matched very close to the beginning of the text; and (2) the text or query contained Unicode combining characters. The first case is addressed by right-justifying the left context to the proper width. The second case is addressed by replacing the standard Python len(s) with a count that excludes combining characters, and by replacing s[:width] and s[-width:] with a new utility function that accounts for combining characters.
1 parent 582e6e3 commit ce6a0a2

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

AUTHORS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@
127127
- David McClosky
128128
- Xinfan Meng
129129
- Dmitrijs Milajevs
130+
- Matt Miller
130131
- Margaret Mitchell
131132
- Tomonori Nagano
132133
- Jason Narad

nltk/text.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import re
1818
import sys
19+
import unicodedata
1920
from collections import Counter, defaultdict, namedtuple
2021
from functools import reduce
2122
from math import log
@@ -27,7 +28,7 @@
2728
from nltk.probability import ConditionalFreqDist as CFD
2829
from nltk.probability import FreqDist
2930
from nltk.tokenize import sent_tokenize
30-
from nltk.util import LazyConcatenation, tokenwrap
31+
from nltk.util import LazyConcatenation, cut_string, tokenwrap
3132

3233
ConcordanceLine = namedtuple(
3334
"ConcordanceLine",
@@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
193194
else:
194195
phrase = [word]
195196

196-
half_width = (width - len(" ".join(phrase)) - 2) // 2
197+
phrase_str = " ".join(phrase)
198+
phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
199+
half_width = (width - phrase_len - 2) // 2
197200
context = width // 4 # approx number of words of context
198201

199202
# Find the instances of the word to create the ConcordanceLine
@@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
209212
left_context = self._tokens[max(0, i - context) : i]
210213
right_context = self._tokens[i + len(phrase) : i + context]
211214
# Create the pretty lines with the query_word in the middle.
212-
left_print = " ".join(left_context)[-half_width:]
213-
right_print = " ".join(right_context)[:half_width]
215+
left_print = cut_string(" ".join(left_context), -half_width).rjust(
216+
half_width
217+
)
218+
right_print = cut_string(" ".join(right_context), half_width)
214219
# The WYSIWYG line of the concordance.
215220
line_print = " ".join([left_print, query_word, right_print])
216221
# Create the ConcordanceLine

nltk/util.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pydoc
1313
import re
1414
import textwrap
15+
import unicodedata
1516
import warnings
1617
from collections import defaultdict, deque
1718
from itertools import chain, combinations, islice, tee
@@ -139,6 +140,41 @@ def tokenwrap(tokens, separator=" ", width=70):
139140
return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
140141

141142

143+
def cut_string(s, width=70):
    """
    Cut off and return a given display width of a string.

    Return the same as ``s[:width]`` if ``width >= 0`` or ``s[width:]``
    if ``width < 0``, as long as ``s`` has no Unicode combining
    characters.  Combining characters (those for which
    ``unicodedata.combining`` returns a non-zero class) are not counted
    towards the width, and they are always kept together with the
    character they follow, so a cut never separates a visible character
    from its marks.

    :param s: the string to cut
    :type s: str
    :param width: the display width; a non-negative value keeps the
        start of the string, a negative value keeps the end
    :type width: int
    :return: the leading (``width >= 0``) or trailing (``width < 0``)
        part of ``s`` containing at most ``abs(width)`` non-combining
        characters
    :rtype: str
    """
    abs_width = abs(width)
    if abs_width == 0:
        return ""

    is_combining = unicodedata.combining
    visible = 0

    if width < 0:
        # Walking backwards, marks are met before the character they
        # modify, so stopping on the base character keeps them attached.
        for start in range(len(s) - 1, -1, -1):
            if not is_combining(s[start]):
                visible += 1
                if visible == abs_width:
                    return s[start:]
        return s

    # Walking forwards, only stop at the *next* visible character so that
    # marks trailing the last kept character are not chopped off.
    for end, char in enumerate(s):
        if not is_combining(char):
            if visible == abs_width:
                return s[:end]
            visible += 1
    return s
176+
177+
142178
##########################################################################
143179
# Indexing
144180
##########################################################################

0 commit comments

Comments
 (0)