#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py          # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE  # Specify output file

import unicodedata
from unidecode import unidecode
import sys
import argparse
from collections import defaultdict

# Try to get unidecode version
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    unidecode_version = 'unknown'

# This script's file name
from pathlib import Path
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Define the range marker value
RANGE_MARKER = 0x00
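# A fallback value of 0x00 never appears in the emitted table as a real
# mapping, so it can double as an in-band flag: compress_ranges() below emits
# it as the fallback byte of the entry that opens a compressed range.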

def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points."""
    fallback_map = {}

    # Process BMP characters (0x0000 - 0xFFFF) to keep table size manageable
    for cp in range(0x0080, 0x10000):  # Skip ASCII range (0x00-0x7F)
        char = chr(cp)

        # Skip unassigned/control characters
        try:
            if not unicodedata.name(char, ''):
                continue
        except ValueError:
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map

def get_special_overrides():
    """Get special case overrides that need different handling than unidecode
    provides... or doesn't provide at all."""

    overrides = {}

    # Multi-character unidecode output
    # These map to single chars instead of unidecode's multiple-char mappings
    # In a terminal fallback context, we need a single character rather than multiple
    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
    overrides[0x0153] = ord('e')  # œ LATIN SMALL LIGATURE OE -> e (unidecode: "oe")
    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")

    # Comparison operators that unidecode renders as multiple characters
    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")

    # Unidecode returns an empty string for these
    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)

    # Quadrant block characters that unidecode doesn't map
    for cp in range(0x2596, 0x259F+1):
        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)

    # Directional arrows
    # These provide better semantic meaning than unidecode's mappings
    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")

    # Double arrows with their directional semantic mappings
    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v

    # Halfwidth arrows
    # These need the same treatment as their normal-width counterparts
    overrides[0xFFE9] = ord('<')  # ← HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0xFFEA] = ord('^')  # ↑ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0xFFEB] = ord('>')  # → HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0xFFEC] = ord('v')  # ↓ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")

    # Currency symbols - each mapped to a representative letter
    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E

    # Symbols mapped to letters
    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T

    # Degree-related symbols
    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F

    # Angle quotation marks
    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >

    # Operators with circular shape
    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .

    # Negated mathematical symbols (preserving the negation semantics)
    # Negated symbols mapped to exclamation mark (semantically "not")
    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)

    # Negated symbols mapped to hash sign (semantically "not equal")
    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)

    # Negated arrows - all mapped to exclamation mark
    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
        overrides[cp] = ord('!')  # Negated arrows -> ! (not)

    # Dashes and hyphens
    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
        overrides[cp] = ord('-')  # Dashes and hyphens -> -

    # Question mark punctuation
    for cp in (0x203D, 0x2047, 0x2048):
        overrides[cp] = ord('?')  # Question marks -> ?

    # Exclamation mark punctuation
    for cp in (0x203C, 0x2049):
        overrides[cp] = ord('!')  # Exclamation marks -> !

    # Asterisk-like symbols
    for cp in (0x2042, 0x2051, 0x2055):
        overrides[cp] = ord('*')

    # Other specific punctuation with unique mappings
    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK

    # Check marks
    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK

    # X marks - lowercase for regular, uppercase for heavy
    for cp in (0x2715, 0x2717):
        overrides[cp] = ord('x')  # Regular X marks -> x
    for cp in (0x2716, 0x2718):
        overrides[cp] = ord('X')  # Heavy X marks -> X

    # Stars and asterisk-like symbols mapped to '*'
    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2721, 0x2746+1):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2749, 0x274B+1):
        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
        overrides[cp] = ord('*')  # Star operators -> *

    # Special exclusions with fallback value of 0
    # These will be filtered out in organize_by_pages()

    # Exclude U+2028 (LINE SEPARATOR)
    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')

    return overrides

def organize_by_pages(fallback_map):
    """Organize the fallback mappings by their high byte (page)."""
    # Group by high byte (page)
    page_groups = defaultdict(list)
    for code, fallback in fallback_map.items():
        # Skip characters with fallback value of 0 (excluded characters)
        if fallback == 0:
            continue

        page = code >> 8      # Get the high byte (page)
        offset = code & 0xFF  # Get the low byte (offset within page)
        page_groups[page].append((offset, fallback))

    # Sort each page's entries by offset
    for page in page_groups:
        page_groups[page].sort()

    return page_groups

def compress_ranges(page_groups):
    """Compress consecutive entries with the same fallback character into ranges.
    A range is only compressed if it contains 3 or more consecutive entries."""
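    # A compressed run is written as two consecutive entries:
    #   { start_offset, RANGE_MARKER } followed by { end_offset, fallback }
    # For example, the quadrant blocks U+2596..U+259F (page 0x25), which all
    # fall back to '#', collapse to { 0x96, 0x00 }, { 0x9F, 0x23 }.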

    compressed_pages = {}

    for page, entries in page_groups.items():
        compressed_entries = []
        i = 0
        while i < len(entries):
            start_offset, fallback = entries[i]

            # Look ahead to find consecutive entries with the same fallback
            j = i + 1
            while (j < len(entries) and
                   entries[j][0] == entries[j-1][0] + 1 and  # consecutive offsets
                   entries[j][1] == fallback):               # same fallback
                j += 1

            # Calculate the range end
            end_offset = entries[j-1][0]

            # If we found a range with 3 or more entries (worth compressing)
            if j - i >= 3:
                # Add a range entry
                compressed_entries.append((start_offset, RANGE_MARKER))
                compressed_entries.append((end_offset, fallback))
            else:
                # Add the individual entries as is
                for k in range(i, j):
                    compressed_entries.append(entries[k])

            i = j

        compressed_pages[page] = compressed_entries

    return compressed_pages

def cp_name(cp):
    """Get the Unicode character name for a code point."""
    try:
        return unicodedata.name(chr(cp))
    except ValueError:
        return f"U+{cp:04X}"

def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables."""
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Create output file
    with open(out_file, 'w') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Convert compressed_pages to a sorted list of (page, entries) tuples
        sorted_pages = sorted(compressed_pages.items())

        # Track the start index for each page
        start_index = 0

        # Write page descriptors
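        # Each descriptor is written as { page, entry count, start index },
        # where the start index points into ucs_fallback_entries[] below.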
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            page_hex = f"0x{page:02X}"
            f.write(f"\t/* Entries for page {page_hex} */\n")

            for offset, fallback in entries:
                # Convert to hex for better readability
                offset_hex = f"0x{offset:02X}"
                fallback_hex = f"0x{fallback:02X}"

                # Handle comments
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ {offset_hex}, {fallback_hex} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    generate_fallback_tables(out_file=args.output_file)