Skip to content

Commit 7688393

Browse files
committed
*** empty log message ***
1 parent da8dd8a commit 7688393

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

tools/htmlmathml-f-ent-to-awk.pl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env perl
2+
#
3+
# Tool to create htmlmathml.awk from
4+
# https://www.w3.org/2003/entities/2007/htmlmathml-f.ent
5+
#
6+
# Expects lines such as
7+
#
8+
# <!ENTITY npart "&#x02202;&#x00338;" ><!--PARTIAL DIFFERENTIAL with slash -->
9+
#
10+
# The quoted replacement text contains text and numeric entities. If
11+
# the replacement text contains a character that would be an XML
12+
# delimiter, that delimiter is not written as "&#nnn;" but as
13+
# "&#38;#nnn;". E.g., the replacement for "&lt;" would be "&#60;"
14+
# (which stands for "<"), but is written as "&#38;#60;".
15+
#
16+
# Each entity line creates an Awk array entry like
17+
#
18+
# htmlmathml::ent["npart"] = "∂̸"
19+
20+
use warnings;
21+
use strict;
22+
use feature 'unicode_strings';
23+
24+
# Main program: read entity declarations from the files named on the
# command line (or from standard input) and write an Awk source file to
# standard output. The generated file fills the array htmlmathml::ent
# inside a BEGIN block, one element per entity declaration found.

binmode(STDOUT, ":utf8");

# Fixed header of the generated file. The heredoc is non-interpolating,
# so the "@" of "@namespace" needs no escaping.
print <<'EOT';
# Defines the array htmlmathml::ent with replacements for
# all named character entities of HTML.
# This file is in UTF-8.

@namespace "htmlmathml"

BEGIN {
EOT

while (my $line = <>) {
  # Normally there is one declaration per line, but iterate with /g so
  # that additional declarations on the same line are not silently lost.
  while ($line =~ /<!ENTITY\s+([a-zA-Z][a-zA-Z0-9.-]*)\s*"([^"]*)"/g) {
    my ($name, $value) = ($1, $2);  # Entity name and replacement text
    my $literal = awk_escape(decode_char_refs($value));
    print "ent[\"$name\"] = \"$literal\"\n";
  }
}

print "}\n";

# Output is buffered, so write errors (full disk, closed pipe) may only
# show up when the handle is closed. Fail loudly instead of leaving a
# truncated Awk file behind with a zero exit status.
close(STDOUT) or die "$0: error writing output: $!\n";


# decode_char_refs($text)
#
# Return $text with every hexadecimal ("&#x02202;") or decimal ("&#60;")
# character reference replaced by the character it denotes. A reference
# whose ampersand is itself escaped, as in "&#38;#60;" (used in the
# entity file for characters that are XML delimiters, such as "<"), is
# decoded to the final character as well.
sub decode_char_refs {
  my ($text) = @_;
  $text =~ s/&#(?:38;#)?(?:x([0-9a-f]+)|([0-9]+));/defined $1 ? chr(hex($1)) : chr($2)/gie;
  return $text;
}


# awk_escape($text)
#
# Return $text made safe for use between double quotes in an Awk string
# literal: control characters (U+0000..U+001F), double quotes and
# backslashes are replaced by two-digit "\xHH" hexadecimal escapes.
sub awk_escape {
  my ($text) = @_;
  $text =~ s/([\x00-\x1F"\\])/sprintf("\\x%02x", ord($1))/ge;
  return $text;
}

0 commit comments

Comments
 (0)