# A simple Python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
2121
22+
# Entities whose all-upper-case spelling is emitted and accepted as well.
_UPPER_CASE_VARIANTS = ("quot", "copy", "gt", "lt", "reg", "amp")

# A generated line is flushed before it would reach this many characters.
_MAX_LINE_WIDTH = 80


def _parse_entity_codes(entity_text):
    """Map each entity name declared in *entity_text* to the text emitted for it.

    Only lines of the form ``<!ENTITY name "&#NNN;">`` (optionally written as
    ``"&#38;#NNN;"``) are recognised; every other line is ignored.
    """
    declaration = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
    codes = {}
    # splitlines() breaks on real line boundaries, so each declaration is
    # matched from the start of its own line.
    for line in entity_text.splitlines():
        match = declaration.match(line)
        if not match:
            continue
        name = match.group(1)
        if name == "quot":
            # Emitted inside a double-quoted string, so the quote is escaped.
            codes[name] = r"\""
        elif name == "nbsp":
            # The non-breaking space is deliberately mapped to a plain space.
            codes[name] = " "
        else:
            codes[name] = r"\u%04X" % int(match.group(2))
    return codes


def _append_wrapped(lines, current, entry, continuation):
    """Return *current* extended by *entry*, flushing it to *lines* first if too long."""
    if len(current) + len(entry) >= _MAX_LINE_WIDTH:
        lines.append(current)
        current = continuation
    return current + entry


def main():
    """Write the generated entity snippet to the file named by ``sys.argv[1]``.

    The output consists of the license header, a ``CharacterEntities``
    alternation listing every entity name (plus the accepted upper-case
    variants), and a ``%{ ... %}`` section that declares the
    ``upperCaseVariantsAccepted`` and ``entityValues`` maps.

    The whole text is built in memory and written in one step, so
    ``sys.stdout`` is never redirected and the target file is not touched if
    generation fails.

    NOTE(review): whitespace inside the emitted literals is reproduced exactly
    as it appears in the reviewed copy of this script -- confirm it against
    the desired layout of the generated file.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <output-file>" % sys.argv[0])

    codes = _parse_entity_codes(get_entity_text())
    keys = sorted(codes)

    lines = [get_apache_license()]

    # Alternation of all entity names, wrapped at the maximum line width.
    current = "CharacterEntities = ( "
    separator = ""
    for key in keys:
        current = _append_wrapped(lines, current, '%s"%s"' % (separator, key), " ")
        separator = " | "
        if key in _UPPER_CASE_VARIANTS:
            current = _append_wrapped(lines, current, ' | "%s"' % key.upper(), " ")
    lines.append(current + " )")

    lines.append("%{")
    lines.append(" private static final Map<String,String> upperCaseVariantsAccepted")
    lines.append(" = new HashMap<>();")
    lines.append(" static {")
    for name in _UPPER_CASE_VARIANTS:
        lines.append(
            ' upperCaseVariantsAccepted.put("%s", "%s");' % (name, name.upper())
        )
    lines.append(" }")
    lines.append(" private static final CharArrayMap<Character> entityValues")
    lines.append(" = new CharArrayMap<>(%i, false);" % len(keys))
    lines.append(" static {")
    lines.append(" String[] entities = {")

    # Flat list of name/value pairs, wrapped at the maximum line width.
    current = " "
    for key in keys:
        current = _append_wrapped(
            lines, current, ' "%s", "%s",' % (key, codes[key]), " "
        )
    # Drop the trailing comma left by the last pair.
    lines.append(current[:-1])

    lines.extend(
        [
            " };",
            " for (int i = 0 ; i < entities.length ; i += 2) {",
            " Character value = entities[i + 1].charAt(0);",
            " entityValues.put(entities[i], value);",
            " String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);",
            " if (upperCaseVariant != null) {",
            " entityValues.put(upperCaseVariant, value);",
            " }",
            " }",
            " }",
            "%}",
        ]
    )

    with open(sys.argv[1], "w", encoding="utf-8") as out:
        out.write("\n".join(lines) + "\n")
96+
9097
9198def get_entity_text ():
92- # The text below is taken verbatim from
93- # <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
94- text = r"""
99+ # The text below is taken verbatim from
100+ # <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
101+ text = r"""
95102F.1. XHTML Character Entities
96103
97104XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
@@ -517,10 +524,11 @@ def get_entity_text():
517524
518525<!-- end of xhtml-symbol.ent -->
519526"""
520- return text
527+ return text
528+
521529
522530def get_apache_license ():
523- license = r"""/*
531+ license = r"""/*
524532 * Licensed to the Apache Software Foundation (ASF) under one or more
525533 * contributor license agreements. See the NOTICE file distributed with
526534 * this work for additional information regarding copyright ownership.
@@ -538,6 +546,7 @@ def get_apache_license():
538546 */
539547
540548"""
541- return license
549+ return license
550+
542551
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments