Skip to content

Commit ac684a5

Browse files
committed
Improve Unicode support with grapheme clusters
**Problem**: asciimatics text utilities wrongly split up grapheme clusters (emoji ZWJ sequences like 👨‍👩‍👧, regional flags like 🇨🇦, skin tone modifiers, combining characters), causing display corruption and incorrect width calculations. **Solution**: Integrate with wcwidth >= 0.5.0 by using: - https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.iter_graphemes for iteration in _enforce_width_ext(), _find_min_start(), _get_offset() - https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.wrap for grapheme-aware word wrapping in _split_text() - https://wcwidth.readthedocs.io/en/latest/api.html#wcwidth.ljust for line padding in SpeechBubble I notice that there is an option to "ignore unicode" as a performance improvement. However, wcwidth has many "fast path" checks for pure-ASCII strings (returning len(string) and so on), along with lru_cache, so the performance cost of always supporting Unicode is negligible.
1 parent 0a400e6 commit ac684a5

File tree

6 files changed

+397
-69
lines changed

6 files changed

+397
-69
lines changed

asciimatics/renderers/base.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,15 @@
55
from abc import ABCMeta, abstractmethod
66
import re
77
from typing import Callable, List, Optional, Tuple, Iterable
8-
from wcwidth.wcwidth import wcswidth
8+
from wcwidth import wcswidth
99
from asciimatics.screen import Screen, TemporaryCanvas
1010
from asciimatics.constants import COLOUR_REGEX
1111

12+
#: Type alias for colour tuples (foreground, attribute, background)
13+
Colour = Tuple[Optional[int], Optional[int], Optional[int]]
14+
#: Type alias for rendered text return type (text lines, colour map)
15+
RenderedText = Tuple[List[str], List[List[Colour]]]
16+
1217
#: Attribute conversion table for the ${c,a} form of attributes for
1318
#: :py:obj:`~.Screen.paint`.
1419
ATTRIBUTES = {
@@ -47,7 +52,7 @@ def max_width(self) -> int:
4752
@property
4853
@abstractmethod
4954
def rendered_text(
50-
self) -> Tuple[List[str], List[List[Tuple[Optional[int], Optional[int], Optional[int]]]]]:
55+
self) -> RenderedText:
5156
"""
5257
:return: The next image and colour map in the sequence as a tuple.
5358
"""
@@ -156,7 +161,7 @@ def images(self) -> Iterable[List[str]]:
156161

157162
@property
158163
def rendered_text(
159-
self) -> Tuple[List[str], List[List[Tuple[Optional[int], Optional[int], Optional[int]]]]]:
164+
self) -> RenderedText:
160165
"""
161166
:return: The next image and colour map in the sequence as a tuple.
162167
"""
@@ -253,7 +258,7 @@ def _colour_map(self) -> List[List[Tuple[Optional[int], Optional[int], Optional[
253258
return self._canvas.colour_map
254259

255260
@abstractmethod
256-
def _render_now(self) -> Tuple[List[str], List[List[Tuple[Optional[int], Optional[int], Optional[int]]]]]:
261+
def _render_now(self) -> RenderedText:
257262
"""
258263
Common method to render the latest image.
259264
@@ -264,7 +269,7 @@ def _render_now(self) -> Tuple[List[str], List[List[Tuple[Optional[int], Optiona
264269
@abstractmethod
265270
def _render_all(
266271
self
267-
) -> Iterable[Tuple[List[str], List[List[Tuple[Optional[int], Optional[int], Optional[int]]]]]]:
272+
) -> Iterable[RenderedText]:
268273
"""
269274
Generate all output.
270275
@@ -276,12 +281,13 @@ def _render_all(
276281

277282
@property
278283
def images(self) -> Iterable[List[str]]:
279-
# Attempt to get all images. Note that many are genuinely dynamic and so will only return one.
284+
# Attempt to get all images. Note that many are genuinely dynamic
285+
# and so will only return one.
280286
return [x[0] for x in self._render_all()]
281287

282288
@property
283289
def rendered_text(
284-
self) -> Tuple[List[str], List[List[Tuple[Optional[int], Optional[int], Optional[int]]]]]:
290+
self) -> RenderedText:
285291
if self._must_clear:
286292
self._clear()
287293
return self._render_now()

asciimatics/renderers/speechbubble.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
This module implements a speech-bubble effect renderer.
33
"""
44
from typing import Optional, Union
5-
from wcwidth.wcwidth import wcswidth
5+
from wcwidth import wcswidth, ljust
66
from asciimatics.renderers.base import StaticRenderer, Renderer
77

88

@@ -25,14 +25,12 @@ def __init__(self, text: Union[str, Renderer], tail: Optional[str] = None, uni:
2525
if uni:
2626
bubble = "╭─" + "─" * max_len + "─╮\n"
2727
for line in text_list:
28-
filler = " " * (max_len - len(line))
29-
bubble += "│ " + line + filler + " │\n"
28+
bubble += "│ " + ljust(line, max_len) + " │\n"
3029
bubble += "╰─" + "─" * max_len + "─╯"
3130
else:
3231
bubble = ".-" + "-" * max_len + "-.\n"
3332
for line in text_list:
34-
filler = " " * (max_len - len(line))
35-
bubble += "| " + line + filler + " |\n"
33+
bubble += "| " + ljust(line, max_len) + " |\n"
3634
bubble += "`-" + "-" * max_len + "-`"
3735
if tail == "L":
3836
bubble += "\n"

asciimatics/widgets/utilities.py

Lines changed: 73 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from collections import defaultdict
66
from functools import lru_cache
77
from typing import TYPE_CHECKING, List, Tuple, Optional, Union
8-
from wcwidth import wcswidth, wcwidth
8+
from wcwidth import wcswidth, iter_graphemes, wrap as wcwidth_wrap
99
from asciimatics.screen import Screen
1010
if TYPE_CHECKING:
1111
from asciimatics.widgets.widget import Widget
@@ -123,7 +123,8 @@ def _enforce_width(text: Union[str, ColouredText],
123123
:param split_on_words: Whether to respect word boundaries when splitting.
124124
:return: The resulting truncated text
125125
"""
126-
return _enforce_width_ext(text, width, unicode_aware=unicode_aware, split_on_words=split_on_words)[0]
126+
return _enforce_width_ext(
127+
text, width, unicode_aware=unicode_aware, split_on_words=split_on_words)[0]
127128

128129

129130
def _enforce_width_ext(text: Union[str, ColouredText],
@@ -132,7 +133,7 @@ def _enforce_width_ext(text: Union[str, ColouredText],
132133
split_on_words: bool = False) -> Tuple[Union[str, ColouredText], bool]:
133134
"""
134135
Enforce a displayed piece of text to be a certain number of cells wide. This takes into
135-
account double-width characters used in CJK languages.
136+
account double-width characters used in CJK languages and grapheme clusters.
136137
137138
:param text: The text to be truncated
138139
:param width: The screen cell width to enforce
@@ -148,20 +149,24 @@ def _enforce_width_ext(text: Union[str, ColouredText],
148149
# Can still optimize performance if we are not handling unicode characters.
149150
if unicode_aware or split_on_words:
150151
size = 0
151-
last_space = 9999999999
152-
for i, char in enumerate(str(text)):
153-
c_width = wcwidth(char) if ord(char) >= 256 else 1
154-
if split_on_words and char in (" ", "\t"):
155-
last_space = i + 1
156-
if size + c_width > width:
157-
return text[0:min(i, last_space)], True
158-
size += c_width
152+
pos = 0
153+
last_space_pos = 9999999999
154+
text_str = str(text)
155+
for grapheme in iter_graphemes(text_str):
156+
g_width = wcswidth(grapheme)
157+
if split_on_words and grapheme in (" ", "\t"):
158+
last_space_pos = pos + len(grapheme)
159+
if size + g_width > width:
160+
return text[0:min(pos, last_space_pos)], True
161+
size += g_width
162+
pos += len(grapheme)
159163
elif len(text) + 1 > width:
160164
return text[0:width], True
161165
return text, False
162166

163167

164-
def _find_min_start(text: str, max_width: int, unicode_aware: bool = True, at_end: bool = False) -> int:
168+
def _find_min_start(text: str, max_width: int, unicode_aware: bool = True,
169+
at_end: bool = False) -> int:
165170
"""
166171
Find the starting point in the string that will reduce it to be less than or equal to the
167172
specified width when displayed on screen.
@@ -176,24 +181,29 @@ def _find_min_start(text: str, max_width: int, unicode_aware: bool = True, at_en
176181
if 2 * len(text) < max_width:
177182
return 0
178183

179-
# OK - do it the hard way...
184+
# OK - do it the hard way, iterating by grapheme cluster to avoid splitting them...
180185
result = 0
181-
string_len = wcswidth if unicode_aware else len
182-
char_len = wcwidth if unicode_aware else lambda x: 1
183-
display_end = string_len(text)
184-
while display_end > max_width:
185-
result += 1
186-
display_end -= char_len(text[0])
187-
text = text[1:]
186+
if unicode_aware:
187+
display_end = wcswidth(text)
188+
for grapheme in iter_graphemes(text):
189+
if display_end <= max_width:
190+
break
191+
display_end -= wcswidth(grapheme)
192+
result += len(grapheme)
193+
else:
194+
display_end = len(text)
195+
while display_end > max_width:
196+
result += 1
197+
display_end -= 1
188198
if at_end and display_end == max_width:
189-
result += 1
199+
result += len(next(iter_graphemes(text[result:]), "")) if unicode_aware else 1
190200
return result
191201

192202

193203
def _get_offset(text: str, visible_width: int, unicode_aware: bool = True) -> int:
194204
"""
195205
Find the character offset within some text for a given visible offset (taking into account the
196-
fact that some character glyphs are double width).
206+
fact that some character glyphs are double width and grapheme clusters).
197207
198208
:param text: The text to analyze
199209
:param visible_width: The required location within that text (as seen on screen).
@@ -202,13 +212,12 @@ def _get_offset(text: str, visible_width: int, unicode_aware: bool = True) -> in
202212
result = 0
203213
width = 0
204214
if unicode_aware:
205-
for char in text:
206-
if visible_width - width <= 0:
215+
for grapheme in iter_graphemes(text):
216+
g_width = wcswidth(grapheme)
217+
if width + g_width > visible_width:
207218
break
208-
result += 1
209-
width += wcwidth(char)
210-
if visible_width - width < 0:
211-
result -= 1
219+
result += len(grapheme)
220+
width += g_width
212221
else:
213222
result = min(len(text), visible_width)
214223
return result
@@ -227,42 +236,51 @@ def _split_text(text: str, width: int, height: int, unicode_aware: bool = True)
227236
:param height: The maximum height for the resulting text.
228237
:return: A list of strings of the broken up text.
229238
"""
230-
# At a high level, just try to split on whitespace for the best results.
231-
tokens = text.split(" ")
232-
result = []
233-
current_line = ""
234239
string_len = wcswidth if unicode_aware else len
235-
for token in tokens:
236-
for i, line_token in enumerate(token.split("\n")):
237-
if string_len(current_line + line_token) > width or i > 0:
238-
# Don't bother inserting completely blank lines
239-
# which should only happen on the very first
240-
# line (as the rest will inject whitespace/newlines)
241-
if len(current_line) > 0:
242-
result.append(current_line.rstrip())
243-
current_line = line_token + " "
244-
else:
245-
current_line += line_token + " "
246240

247-
# At this point we've either split nicely or have a hugely long unbroken string
248-
# (e.g. because the language doesn't use whitespace.
249-
# Either way, break this last line up as best we can.
250-
current_line = current_line.rstrip()
251-
while string_len(current_line) > 0:
252-
new_line = str(_enforce_width(current_line, width, unicode_aware))
253-
result.append(new_line)
254-
current_line = current_line[len(new_line):]
241+
if unicode_aware:
242+
# Use wcwidth.wrap() for grapheme, east-asian, emoji, and terminal sequence-aware word
243+
# wrapping, modeled after standard python textwrap.wrap(). We split on newlines first to
244+
# preserve newlines, as documented at bottom of API document of wcwidth.wrap(), its what
245+
# most people prefer, like the html classic '<br><br><br>' sometimes you want them.
246+
result = []
247+
for paragraph in text.split("\n"):
248+
if paragraph:
249+
result.extend(wcwidth_wrap(paragraph, width, break_long_words=True))
250+
else:
251+
result.append("")
252+
else:
253+
# Legacy non-unicode path
254+
tokens = text.split(" ")
255+
result = []
256+
current_line = ""
257+
for token in tokens:
258+
for i, line_token in enumerate(token.split("\n")):
259+
if len(current_line + line_token) > width or i > 0:
260+
if len(current_line) > 0:
261+
result.append(current_line.rstrip())
262+
current_line = line_token + " "
263+
else:
264+
current_line += line_token + " "
265+
current_line = current_line.rstrip()
266+
while len(current_line) > 0:
267+
new_line = current_line[:width]
268+
result.append(new_line)
269+
current_line = current_line[len(new_line):]
255270

256-
# Check for a height overrun and truncate.
271+
# Check for a height overrun and truncate with ellipsis.
257272
if len(result) > height:
258273
result = result[:height]
259-
result[height - 1] = result[height - 1][:width - 3] + "..."
274+
last_line = result[height - 1]
275+
truncated = _enforce_width(last_line, width - 3, unicode_aware)
276+
result[height - 1] = str(truncated) + "..."
260277

261278
# Very small columns could be shorter than individual words - truncate
262279
# each line if necessary.
263280
for i, line in enumerate(result):
264-
if len(line) > width:
265-
result[i] = line[:width - 3] + "..."
281+
if string_len(line) > width:
282+
truncated = _enforce_width(line, width - 3, unicode_aware)
283+
result[i] = str(truncated) + "..."
266284
return result
267285

268286

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ keywords = [
5454
dependencies = [
5555
'pyfiglet >= 0.7.2',
5656
'Pillow >= 2.7.0',
57-
'wcwidth',
57+
'wcwidth >= 0.5.0',
5858
"pywin32 >= 1.0; platform_system=='Windows'",
5959
]
6060
requires-python = ">= 3.8"

requirements/base.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
wcwidth
1+
wcwidth >= 0.5
22
pyfiglet >= 0.7.2
33
setuptools_scm

0 commit comments

Comments
 (0)