|
#!/usr/bin/env perl
#
# Tool to create htmlmathml.awk from
# https://www.w3.org/2003/entities/2007/htmlmathml-f.ent
#
# Reads the entity declarations from the files named on the command
# line (or from standard input if there are none) and writes the Awk
# source to standard output.
#
# Expects lines such as
#
#   <!ENTITY npart "&#x02202;&#x00338;" ><!--PARTIAL DIFFERENTIAL with slash -->
#
# The quoted replacement text contains text and numeric entities. If
# the replacement text contains a character that would be an XML
# delimiter, that delimiter is not written as "&#nnn;" but as
# "&#38;#nnn;". E.g., the replacement for "&lt;" would be "&#60;"
# (which stands for "<"), but is written as "&#38;#60;".
#
# Each entity line creates an Awk array entry like
#
#   ent["npart"] = "∂̸"
#
# inside the Awk namespace "htmlmathml", i.e., htmlmathml::ent["npart"].

use warnings;
use strict;
use feature 'unicode_strings';

# chr() below produces characters, not bytes; encode them as (strict)
# UTF-8 when they are written out.
binmode(STDOUT, ":encoding(UTF-8)")
    or die "Cannot switch standard output to UTF-8: $!\n";

# Fixed preamble of the generated Awk file. The heredoc is
# single-quoted so that "@namespace" is not interpolated.
print <<'END_PREAMBLE';
# Defines the array htmlmathml::ent with replacements for
# all named character entities of HTML.
# This file is in UTF-8.

@namespace "htmlmathml"

BEGIN {
END_PREAMBLE

while (my $line = <>) {
    # Only lines holding an entity declaration produce output; anything
    # else is skipped. At most one declaration per line is used.
    next unless $line =~ /<!ENTITY\s+([a-zA-Z][a-zA-Z0-9.-]*)\s*"([^"]*)"/;
    my ($name, $value) = ($1, $2);    # Entity name and replacement text

    # Replace hexadecimal and decimal character entities by the
    # characters they denote. Some replacements are doubly escaped
    # (e.g., "lt" -> "&#38;#60;") because the actual character would
    # not be legal XML; the optional "38;#" part strips that extra
    # level, so such entities are replaced as well.
    # $1 holds the hex digits, $2 the decimal digits; only one is set.
    $value =~ s/&#(?:38;#)?(?:x([0-9a-f]+)|([0-9]+));/chr(defined $1 ? hex($1) : $2)/gie;

    # Replace control characters, quotes and backslashes by hexadecimal
    # escapes, so the value can sit inside an Awk string literal.
    $value =~ s/([\x00-\x1F"\\])/sprintf("\\x%02x", ord($1))/ge;

    print qq{ent["$name"] = "$value"\n};
}

print "}\n";

# Buffered write errors (e.g., a full disk) only surface when the handle
# is closed; fail loudly instead of leaving a truncated Awk file behind.
close(STDOUT) or die "Error writing output: $!\n";
0 commit comments