Skip to content

Commit 5071ddc

Browse files
Nicolas Pitre, gregkh
authored and committed
vt: introduce gen_ucs_fallback_table.py to create ucs_fallback_table.h
The generated table maps complex characters to their simpler fallback forms for a terminal display when corresponding glyphs are unavailable. This includes diacritics, symbols as well as many drawing characters. Fallback characters aren't perfect replacements, obviously. But they are still far more useful than a bunch of squared question marks. Signed-off-by: Nicolas Pitre <[email protected]> Reviewed-by: Jiri Slaby <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent bb9a151 commit 5071ddc

File tree

1 file changed

+352
-0
lines changed

1 file changed

+352
-0
lines changed
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py          # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE  # Specify output file

import argparse
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

from unidecode import unidecode

# Try to get the unidecode version. This is best-effort: the value is only
# written into the comment header of the generated file, so any failure
# (importlib.metadata missing, package metadata not found, ...) degrades to
# 'unknown'. Catch Exception rather than using a bare except so that
# KeyboardInterrupt and SystemExit still propagate.
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    unidecode_version = 'unknown'

# This script's file name (named in the generated file's header comment)
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Value stored in the fallback slot of the first entry of a compressed range
# (see compress_ranges()); also emitted as UCS_PAGE_ENTRY_RANGE_MARKER.
RANGE_MARKER = 0x00

def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points.

    Returns:
        A dict mapping a code point in 0x0080-0xFFFF to the code point of its
        single-character fallback. Entries supplied by
        get_special_overrides() replace the unidecode-derived ones and may
        carry the value 0, which organize_by_pages() treats as "excluded".
    """
    fallback_map = {}

    # Process BMP characters only (up to 0xFFFF) to keep table size
    # manageable; the ASCII range (0x00-0x7F) is skipped.
    for cp in range(0x0080, 0x10000):
        char = chr(cp)

        # Skip code points that have no character name. Because a default
        # ('') is supplied, unicodedata.name() returns it instead of raising
        # ValueError, so no exception handling is needed here.
        if not unicodedata.name(char, ''):
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map

def get_special_overrides():
    """Get special case overrides that need different handling than unidecode
    provides... or doesn't provide at all.

    Returns:
        A dict mapping a code point to the code point of its single ASCII
        fallback character. A value of 0 marks a code point that must be
        excluded from the table; such entries are filtered out in
        organize_by_pages().
    """

    overrides = {}

    # Multi-character unidecode output
    # These map to single chars instead of unidecode's multiple-char mappings
    # In a terminal fallback context, we need a single character rather than multiple
    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
    overrides[0x0153] = ord('e')  # œ LATIN SMALL LETTER LIGATURE OE -> e (unidecode: "oe")
    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")

    # Comparison operators that unidecode renders as multiple characters
    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")

    # Unidecode returns an empty string for these
    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)

    # Quadrant block characters that unidecode doesn't map
    for cp in range(0x2596, 0x259F+1):
        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)

    # Directional arrows
    # These provide better semantic meaning than unidecode's mappings
    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")

    # Double arrows with their directional semantic mappings
    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v

    # Halfwidth arrows
    # These need the same treatment as their normal-width counterparts
    overrides[0xFFE9] = ord('<')  # ← HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0xFFEA] = ord('^')  # ↑ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0xFFEB] = ord('>')  # → HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0xFFEC] = ord('v')  # ↓ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")

    # Currency symbols - each mapped to a representative letter
    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E

    # Symbols mapped to letters
    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T

    # Degree-related symbols
    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F

    # Angle quotation marks
    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >

    # Operators with circular shape
    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .

    # Negated mathematical symbols (preserving the negation semantics)
    # Negated symbols mapped to exclamation mark (semantically "not")
    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)

    # Negated symbols mapped to hash sign (semantically "not equal")
    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)

    # Negated arrows - all mapped to exclamation mark
    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
        overrides[cp] = ord('!')  # Negated arrows -> ! (not)

    # Dashes and hyphens
    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
        overrides[cp] = ord('-')  # Dashes and hyphens -> -

    # Question mark punctuation
    for cp in (0x203D, 0x2047, 0x2048):
        overrides[cp] = ord('?')  # Question marks -> ?

    # Exclamation mark punctuation
    for cp in (0x203C, 0x2049):
        overrides[cp] = ord('!')  # Exclamation marks -> !

    # Asterisk-like symbols
    for cp in (0x2042, 0x2051, 0x2055):
        overrides[cp] = ord('*')

    # Other specific punctuation with unique mappings
    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK

    # Check marks
    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK

    # X marks - lowercase for regular, uppercase for heavy
    for cp in (0x2715, 0x2717):
        overrides[cp] = ord('x')  # Regular X marks -> x
    for cp in (0x2716, 0x2718):
        overrides[cp] = ord('X')  # Heavy X marks -> X

    # Stars and asterisk-like symbols mapped to '*'
    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2721, 0x2746+1):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2749, 0x274B+1):
        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
        overrides[cp] = ord('*')  # Star operators -> *

    # Special exclusions with fallback value of 0
    # These will be filtered out in organize_by_pages()

    # Exclude U+2028 (LINE SEPARATOR)
    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')

    return overrides

def organize_by_pages(fallback_map):
    """Organize the fallback mappings by their high byte (page).

    Args:
        fallback_map: Mapping of code point -> fallback code point. Entries
            whose fallback is 0 are treated as excluded and dropped.

    Returns:
        A defaultdict mapping page (code point >> 8) to a list of
        (offset, fallback) tuples, where offset is the low byte of the code
        point. Each list is sorted in ascending order.
    """
    page_groups = defaultdict(list)
    for code, fallback in fallback_map.items():
        # Skip characters with fallback value of 0 (excluded characters)
        if fallback == 0:
            continue

        # High byte selects the page, low byte is the offset within it
        page_groups[code >> 8].append((code & 0xFF, fallback))

    # Sort each page's entries by offset
    for entries in page_groups.values():
        entries.sort()

    return page_groups

def compress_ranges(page_groups):
    """Compress consecutive entries with the same fallback character into ranges.

    A run is only compressed if it contains 3 or more entries with
    consecutive offsets and the same fallback. Such a run is replaced by two
    entries: (first_offset, RANGE_MARKER) followed by (last_offset, fallback).
    Shorter runs are copied through unchanged.

    Args:
        page_groups: Mapping of page -> list of (offset, fallback) tuples.
            Run detection compares each entry with its predecessor, so each
            list is expected to be ordered by offset.

    Returns:
        A dict mapping each page to its compressed list of
        (offset, fallback) tuples.
    """

    compressed_pages = {}

    for page, entries in page_groups.items():
        compressed_entries = []
        total = len(entries)
        i = 0
        while i < total:
            start_offset, fallback = entries[i]

            # Look ahead to find consecutive entries with the same fallback
            j = i + 1
            while (j < total and
                   entries[j][0] == entries[j - 1][0] + 1 and  # consecutive offsets
                   entries[j][1] == fallback):                 # same fallback
                j += 1

            if j - i >= 3:
                # Worth compressing: emit the range start and range end
                end_offset = entries[j - 1][0]
                compressed_entries.append((start_offset, RANGE_MARKER))
                compressed_entries.append((end_offset, fallback))
            else:
                # Add the individual entries as is
                compressed_entries.extend(entries[i:j])

            i = j

        compressed_pages[page] = compressed_entries

    return compressed_pages

def cp_name(cp):
    """Get the Unicode character name for a code point.

    Args:
        cp: Integer code point.

    Returns:
        The character name from unicodedata, or "U+XXXX" (uppercase hex,
        at least four digits) when no name is available.
    """
    try:
        return unicodedata.name(chr(cp))
    except (ValueError, OverflowError):
        # ValueError: the character has no name, or cp is outside chr()'s
        # range; OverflowError: cp is too large for chr() to even examine.
        return f"U+{cp:04X}"

def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables.

    Builds the fallback map, groups it by page, compresses runs into ranges
    and writes the result as a C header to out_file. Progress statistics
    are printed to stdout along the way.

    Args:
        out_file: Path of the header file to create or overwrite. The value
            is also written verbatim into the header's leading comment.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Convert compressed_pages to a sorted list of (page, entries) tuples.
    # The descriptors and the entries array are both written in this order,
    # which is what makes each descriptor's start index line up.
    sorted_pages = sorted(compressed_pages.items())

    # Create output file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Track the start index for each page
        start_index = 0

        # Write page descriptors: page number, entry count, index of the
        # page's first entry in the entries array
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            page_hex = f"0x{page:02X}"
            f.write(f"\t/* Entries for page {page_hex} */\n")

            for offset, fallback in entries:
                # Rebuild the full code point to look up its name
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    # First half of a range: the fallback is carried by the
                    # entry that follows
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ 0x{offset:02X}, 0x{fallback:02X} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")

if __name__ == "__main__":
    # Command-line entry point: the only option is the output file name.
    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    generate_fallback_tables(out_file=args.output_file)

0 commit comments

Comments
 (0)