Commit 5f69622

Merge pull request nltk#3172 from BroMattMiller/feature/Text-concordance-line-alignment
Align text.ConcordanceIndex.find_concordance()
2 parents (59a1dbc + ce6a0a2), commit 5f69622

File tree: 3 files changed (+46 / -4 lines)
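
For context before the diffs: the commit changes how nltk.text.Text.concordance() (which ultimately calls ConcordanceIndex.find_concordance()) measures line width, so that words containing Unicode combining characters no longer push the concordance columns out of alignment. The snippet below is a minimal illustrative sketch of that situation; the token list is made up for the example:

from nltk.text import Text

# "cafe\u0301" spells 'café' with a combining acute accent, so len() of the
# token (5) is larger than the number of columns it occupies on screen (4).
tokens = ["I", "had", "a", "cafe\u0301", "au", "lait", "at", "the", "cafe\u0301"] * 5
text = Text(tokens)

# Each printed line should keep the query word in the same column; with this
# commit, widths are computed from visible characters rather than raw len().
text.concordance("cafe\u0301", width=40)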

AUTHORS.md

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@
 - David McClosky
 - Xinfan Meng
 - Dmitrijs Milajevs
+- Matt Miller
 - Margaret Mitchell
 - Tomonori Nagano
 - Jason Narad

nltk/text.py

Lines changed: 9 additions & 4 deletions
@@ -16,6 +16,7 @@
 
 import re
 import sys
+import unicodedata
 from collections import Counter, defaultdict, namedtuple
 from functools import reduce
 from math import log
@@ -27,7 +28,7 @@
 from nltk.probability import ConditionalFreqDist as CFD
 from nltk.probability import FreqDist
 from nltk.tokenize import sent_tokenize
-from nltk.util import LazyConcatenation, tokenwrap
+from nltk.util import LazyConcatenation, cut_string, tokenwrap
 
 ConcordanceLine = namedtuple(
     "ConcordanceLine",
@@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
         else:
             phrase = [word]
 
-        half_width = (width - len(" ".join(phrase)) - 2) // 2
+        phrase_str = " ".join(phrase)
+        phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
+        half_width = (width - phrase_len - 2) // 2
         context = width // 4  # approx number of words of context
 
         # Find the instances of the word to create the ConcordanceLine
@@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
             left_context = self._tokens[max(0, i - context) : i]
             right_context = self._tokens[i + len(phrase) : i + context]
             # Create the pretty lines with the query_word in the middle.
-            left_print = " ".join(left_context)[-half_width:]
-            right_print = " ".join(right_context)[:half_width]
+            left_print = cut_string(" ".join(left_context), -half_width).rjust(
+                half_width
+            )
+            right_print = cut_string(" ".join(right_context), half_width)
             # The WYSIWYG line of the concordance.
             line_print = " ".join([left_print, query_word, right_print])
             # Create the ConcordanceLine
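
The core of the text.py change is that len() counts combining marks even though they occupy no terminal column, so the visible width has to be counted by skipping them. A quick illustration of that computation (the sample string is made up; the variable names mirror the diff):

import unicodedata

phrase_str = "cafe\u0301"  # 'café' written with a combining acute accent

print(len(phrase_str))  # 5 -- the combining mark is counted

# Count only characters that actually occupy a column, as in the diff above.
phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
print(phrase_len)  # 4

# half_width is then derived from the visible width of the query phrase.
width = 80
half_width = (width - phrase_len - 2) // 2
print(half_width)  # 37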

nltk/util.py

Lines changed: 36 additions & 0 deletions
@@ -12,6 +12,7 @@
 import pydoc
 import re
 import textwrap
+import unicodedata
 import warnings
 from collections import defaultdict, deque
 from itertools import chain, combinations, islice, tee
@@ -139,6 +140,41 @@ def tokenwrap(tokens, separator=" ", width=70):
     return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
 
 
+def cut_string(s, width=70):
+    """
+    Cut off and return a given width of a string.
+
+    Return the same as s[:width] if width >= 0, or s[width:] if
+    width < 0, as long as s has no unicode combining characters.
+    If it has combining characters, make sure the returned string's
+    visible width matches the called-for width.
+
+    :param s: the string to cut
+    :type s: str
+    :param width: the display width
+    :type width: int
+    """
+    chars_sofar = 0
+    width_sofar = 0
+    result = ""
+
+    abs_width = abs(width)
+    max_chars = len(s)
+    while width_sofar < abs_width and chars_sofar < max_chars:
+        if width < 0:
+            char = s[-(chars_sofar + 1)]
+            result = char + result
+        else:
+            char = s[chars_sofar]
+            result = result + char
+
+        chars_sofar += 1
+        if not unicodedata.combining(char):
+            width_sofar += 1
+
+    return result
+
+
 ##########################################################################
 # Indexing
 ##########################################################################
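
A short usage sketch of the new helper, assuming an nltk build that already includes this commit (on older versions, the function body above can be copied locally):

from nltk.util import cut_string  # available once this commit is installed

s = "cre\u0300me bru\u0302le\u0301e"  # 'crème brûlée' with combining accents: 15 code points, 12 columns

# Positive width: first 5 visible characters, keeping combining marks with their bases.
print(cut_string(s, 5))   # 'crème' -- 6 code points, 5 visible columns

# Negative width: last 5 visible characters, analogous to s[-5:] for plain ASCII.
print(cut_string(s, -5))  # 'rûlée' -- 7 code points, 5 visible columns

# Without combining characters it matches ordinary slicing.
assert cut_string("concordance", 5) == "conco"
assert cut_string("concordance", -5) == "dance"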
