Skip to content

Commit 44949f7

Browse files
authored
Merge pull request #2 from webfactory/workaround-libxml_2_9_2-regression
Work around a (suspected) regression in libxml2 2.9.1 -> 2.9.2
2 parents 123f745 + 6b91086 commit 44949f7

File tree

5 files changed

+1511
-2
lines changed

5 files changed

+1511
-2
lines changed

src/Webfactory/Dom/XHTML10ParsingHelper.php

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,26 @@
88

99
namespace Webfactory\Dom;
1010

11-
class XHTML10ParsingHelper extends HTMLParsingHelper {
11+
class XHTML10ParsingHelper extends HTMLParsingHelper
12+
{
13+
/**
 * Installs a libxml external entity loader that resolves the XHTML 1.0
 * Strict DTD and its three entity sets (Latin 1, Symbols, Special) to the
 * copies in the xml-catalog directory.
 *
 * Any other public identifier is passed through unchanged, i.e. the loader
 * returns the system identifier it was given.
 *
 * Note: libxml_set_external_entity_loader() replaces libxml's default
 * loader, so the registration is not limited to this object.
 */
public function __construct()
{
    // The closure is static on purpose: it never touches $this, and a
    // non-static closure would bind the instance and keep it referenced
    // for as long as the loader stays registered.
    libxml_set_external_entity_loader(static function ($public, $system, $context) {
        // Resolved relative to this source file (src/Webfactory/Dom).
        $catalogDir = __DIR__ . '/../../../xml-catalog/';

        switch ($public) {
            case '-//W3C//DTD XHTML 1.0 Strict//EN':
                return $catalogDir . 'xhtml1-strict.dtd';
            case '-//W3C//ENTITIES Latin 1 for XHTML//EN':
                return $catalogDir . 'xhtml-lat1.ent';
            case '-//W3C//ENTITIES Symbols for XHTML//EN':
                return $catalogDir . 'xhtml-symbol.ent';
            case '-//W3C//ENTITIES Special for XHTML//EN':
                return $catalogDir . 'xhtml-special.ent';
            default:
                // Unknown entity: fall back to the location named in the document.
                return $system;
        }
    });
}
1231

1332
protected function wrapFragment($fragment, $declaredNamespaces)
1433
{
@@ -19,5 +38,4 @@ protected function wrapFragment($fragment, $declaredNamespaces)
1938
$declaredNamespaces
2039
);
2140
}
22-
2341
}

xml-catalog/xhtml-lat1.ent

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
<!-- Portions (C) International Organization for Standardization 1986
2+
Permission to copy in any form is granted for use with
3+
conforming SGML systems and applications as defined in
4+
ISO 8879, provided this notice is included in all copies.
5+
-->
6+
<!-- Character entity set. Typical invocation:
7+
<!ENTITY % HTMLlat1 PUBLIC
8+
"-//W3C//ENTITIES Latin 1 for XHTML//EN"
9+
"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
10+
%HTMLlat1;
11+
-->
12+
13+
<!ENTITY nbsp "&#160;"> <!-- no-break space = non-breaking space,
14+
U+00A0 ISOnum -->
15+
<!ENTITY iexcl "&#161;"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
16+
<!ENTITY cent "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
17+
<!ENTITY pound "&#163;"> <!-- pound sign, U+00A3 ISOnum -->
18+
<!ENTITY curren "&#164;"> <!-- currency sign, U+00A4 ISOnum -->
19+
<!ENTITY yen "&#165;"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
20+
<!ENTITY brvbar "&#166;"> <!-- broken bar = broken vertical bar,
21+
U+00A6 ISOnum -->
22+
<!ENTITY sect "&#167;"> <!-- section sign, U+00A7 ISOnum -->
23+
<!ENTITY uml "&#168;"> <!-- diaeresis = spacing diaeresis,
24+
U+00A8 ISOdia -->
25+
<!ENTITY copy "&#169;"> <!-- copyright sign, U+00A9 ISOnum -->
26+
<!ENTITY ordf "&#170;"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
27+
<!ENTITY laquo "&#171;"> <!-- left-pointing double angle quotation mark
28+
= left pointing guillemet, U+00AB ISOnum -->
29+
<!ENTITY not "&#172;"> <!-- not sign = angled dash,
30+
U+00AC ISOnum -->
31+
<!ENTITY shy "&#173;"> <!-- soft hyphen = discretionary hyphen,
32+
U+00AD ISOnum -->
33+
<!ENTITY reg "&#174;"> <!-- registered sign = registered trade mark sign,
34+
U+00AE ISOnum -->
35+
<!ENTITY macr "&#175;"> <!-- macron = spacing macron = overline
36+
= APL overbar, U+00AF ISOdia -->
37+
<!ENTITY deg "&#176;"> <!-- degree sign, U+00B0 ISOnum -->
38+
<!ENTITY plusmn "&#177;"> <!-- plus-minus sign = plus-or-minus sign,
39+
U+00B1 ISOnum -->
40+
<!ENTITY sup2 "&#178;"> <!-- superscript two = superscript digit two
41+
= squared, U+00B2 ISOnum -->
42+
<!ENTITY sup3 "&#179;"> <!-- superscript three = superscript digit three
43+
= cubed, U+00B3 ISOnum -->
44+
<!ENTITY acute "&#180;"> <!-- acute accent = spacing acute,
45+
U+00B4 ISOdia -->
46+
<!ENTITY micro "&#181;"> <!-- micro sign, U+00B5 ISOnum -->
47+
<!ENTITY para "&#182;"> <!-- pilcrow sign = paragraph sign,
48+
U+00B6 ISOnum -->
49+
<!ENTITY middot "&#183;"> <!-- middle dot = Georgian comma
50+
= Greek middle dot, U+00B7 ISOnum -->
51+
<!ENTITY cedil "&#184;"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
52+
<!ENTITY sup1 "&#185;"> <!-- superscript one = superscript digit one,
53+
U+00B9 ISOnum -->
54+
<!ENTITY ordm "&#186;"> <!-- masculine ordinal indicator,
55+
U+00BA ISOnum -->
56+
<!ENTITY raquo "&#187;"> <!-- right-pointing double angle quotation mark
57+
= right pointing guillemet, U+00BB ISOnum -->
58+
<!ENTITY frac14 "&#188;"> <!-- vulgar fraction one quarter
59+
= fraction one quarter, U+00BC ISOnum -->
60+
<!ENTITY frac12 "&#189;"> <!-- vulgar fraction one half
61+
= fraction one half, U+00BD ISOnum -->
62+
<!ENTITY frac34 "&#190;"> <!-- vulgar fraction three quarters
63+
= fraction three quarters, U+00BE ISOnum -->
64+
<!ENTITY iquest "&#191;"> <!-- inverted question mark
65+
= turned question mark, U+00BF ISOnum -->
66+
<!ENTITY Agrave "&#192;"> <!-- latin capital letter A with grave
67+
= latin capital letter A grave,
68+
U+00C0 ISOlat1 -->
69+
<!ENTITY Aacute "&#193;"> <!-- latin capital letter A with acute,
70+
U+00C1 ISOlat1 -->
71+
<!ENTITY Acirc "&#194;"> <!-- latin capital letter A with circumflex,
72+
U+00C2 ISOlat1 -->
73+
<!ENTITY Atilde "&#195;"> <!-- latin capital letter A with tilde,
74+
U+00C3 ISOlat1 -->
75+
<!ENTITY Auml "&#196;"> <!-- latin capital letter A with diaeresis,
76+
U+00C4 ISOlat1 -->
77+
<!ENTITY Aring "&#197;"> <!-- latin capital letter A with ring above
78+
= latin capital letter A ring,
79+
U+00C5 ISOlat1 -->
80+
<!ENTITY AElig "&#198;"> <!-- latin capital letter AE
81+
= latin capital ligature AE,
82+
U+00C6 ISOlat1 -->
83+
<!ENTITY Ccedil "&#199;"> <!-- latin capital letter C with cedilla,
84+
U+00C7 ISOlat1 -->
85+
<!ENTITY Egrave "&#200;"> <!-- latin capital letter E with grave,
86+
U+00C8 ISOlat1 -->
87+
<!ENTITY Eacute "&#201;"> <!-- latin capital letter E with acute,
88+
U+00C9 ISOlat1 -->
89+
<!ENTITY Ecirc "&#202;"> <!-- latin capital letter E with circumflex,
90+
U+00CA ISOlat1 -->
91+
<!ENTITY Euml "&#203;"> <!-- latin capital letter E with diaeresis,
92+
U+00CB ISOlat1 -->
93+
<!ENTITY Igrave "&#204;"> <!-- latin capital letter I with grave,
94+
U+00CC ISOlat1 -->
95+
<!ENTITY Iacute "&#205;"> <!-- latin capital letter I with acute,
96+
U+00CD ISOlat1 -->
97+
<!ENTITY Icirc "&#206;"> <!-- latin capital letter I with circumflex,
98+
U+00CE ISOlat1 -->
99+
<!ENTITY Iuml "&#207;"> <!-- latin capital letter I with diaeresis,
100+
U+00CF ISOlat1 -->
101+
<!ENTITY ETH "&#208;"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
102+
<!ENTITY Ntilde "&#209;"> <!-- latin capital letter N with tilde,
103+
U+00D1 ISOlat1 -->
104+
<!ENTITY Ograve "&#210;"> <!-- latin capital letter O with grave,
105+
U+00D2 ISOlat1 -->
106+
<!ENTITY Oacute "&#211;"> <!-- latin capital letter O with acute,
107+
U+00D3 ISOlat1 -->
108+
<!ENTITY Ocirc "&#212;"> <!-- latin capital letter O with circumflex,
109+
U+00D4 ISOlat1 -->
110+
<!ENTITY Otilde "&#213;"> <!-- latin capital letter O with tilde,
111+
U+00D5 ISOlat1 -->
112+
<!ENTITY Ouml "&#214;"> <!-- latin capital letter O with diaeresis,
113+
U+00D6 ISOlat1 -->
114+
<!ENTITY times "&#215;"> <!-- multiplication sign, U+00D7 ISOnum -->
115+
<!ENTITY Oslash "&#216;"> <!-- latin capital letter O with stroke
116+
= latin capital letter O slash,
117+
U+00D8 ISOlat1 -->
118+
<!ENTITY Ugrave "&#217;"> <!-- latin capital letter U with grave,
119+
U+00D9 ISOlat1 -->
120+
<!ENTITY Uacute "&#218;"> <!-- latin capital letter U with acute,
121+
U+00DA ISOlat1 -->
122+
<!ENTITY Ucirc "&#219;"> <!-- latin capital letter U with circumflex,
123+
U+00DB ISOlat1 -->
124+
<!ENTITY Uuml "&#220;"> <!-- latin capital letter U with diaeresis,
125+
U+00DC ISOlat1 -->
126+
<!ENTITY Yacute "&#221;"> <!-- latin capital letter Y with acute,
127+
U+00DD ISOlat1 -->
128+
<!ENTITY THORN "&#222;"> <!-- latin capital letter THORN,
129+
U+00DE ISOlat1 -->
130+
<!ENTITY szlig "&#223;"> <!-- latin small letter sharp s = ess-zed,
131+
U+00DF ISOlat1 -->
132+
<!ENTITY agrave "&#224;"> <!-- latin small letter a with grave
133+
= latin small letter a grave,
134+
U+00E0 ISOlat1 -->
135+
<!ENTITY aacute "&#225;"> <!-- latin small letter a with acute,
136+
U+00E1 ISOlat1 -->
137+
<!ENTITY acirc "&#226;"> <!-- latin small letter a with circumflex,
138+
U+00E2 ISOlat1 -->
139+
<!ENTITY atilde "&#227;"> <!-- latin small letter a with tilde,
140+
U+00E3 ISOlat1 -->
141+
<!ENTITY auml "&#228;"> <!-- latin small letter a with diaeresis,
142+
U+00E4 ISOlat1 -->
143+
<!ENTITY aring "&#229;"> <!-- latin small letter a with ring above
144+
= latin small letter a ring,
145+
U+00E5 ISOlat1 -->
146+
<!ENTITY aelig "&#230;"> <!-- latin small letter ae
147+
= latin small ligature ae, U+00E6 ISOlat1 -->
148+
<!ENTITY ccedil "&#231;"> <!-- latin small letter c with cedilla,
149+
U+00E7 ISOlat1 -->
150+
<!ENTITY egrave "&#232;"> <!-- latin small letter e with grave,
151+
U+00E8 ISOlat1 -->
152+
<!ENTITY eacute "&#233;"> <!-- latin small letter e with acute,
153+
U+00E9 ISOlat1 -->
154+
<!ENTITY ecirc "&#234;"> <!-- latin small letter e with circumflex,
155+
U+00EA ISOlat1 -->
156+
<!ENTITY euml "&#235;"> <!-- latin small letter e with diaeresis,
157+
U+00EB ISOlat1 -->
158+
<!ENTITY igrave "&#236;"> <!-- latin small letter i with grave,
159+
U+00EC ISOlat1 -->
160+
<!ENTITY iacute "&#237;"> <!-- latin small letter i with acute,
161+
U+00ED ISOlat1 -->
162+
<!ENTITY icirc "&#238;"> <!-- latin small letter i with circumflex,
163+
U+00EE ISOlat1 -->
164+
<!ENTITY iuml "&#239;"> <!-- latin small letter i with diaeresis,
165+
U+00EF ISOlat1 -->
166+
<!ENTITY eth "&#240;"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
167+
<!ENTITY ntilde "&#241;"> <!-- latin small letter n with tilde,
168+
U+00F1 ISOlat1 -->
169+
<!ENTITY ograve "&#242;"> <!-- latin small letter o with grave,
170+
U+00F2 ISOlat1 -->
171+
<!ENTITY oacute "&#243;"> <!-- latin small letter o with acute,
172+
U+00F3 ISOlat1 -->
173+
<!ENTITY ocirc "&#244;"> <!-- latin small letter o with circumflex,
174+
U+00F4 ISOlat1 -->
175+
<!ENTITY otilde "&#245;"> <!-- latin small letter o with tilde,
176+
U+00F5 ISOlat1 -->
177+
<!ENTITY ouml "&#246;"> <!-- latin small letter o with diaeresis,
178+
U+00F6 ISOlat1 -->
179+
<!ENTITY divide "&#247;"> <!-- division sign, U+00F7 ISOnum -->
180+
<!ENTITY oslash "&#248;"> <!-- latin small letter o with stroke
181+
= latin small letter o slash,
182+
U+00F8 ISOlat1 -->
183+
<!ENTITY ugrave "&#249;"> <!-- latin small letter u with grave,
184+
U+00F9 ISOlat1 -->
185+
<!ENTITY uacute "&#250;"> <!-- latin small letter u with acute,
186+
U+00FA ISOlat1 -->
187+
<!ENTITY ucirc "&#251;"> <!-- latin small letter u with circumflex,
188+
U+00FB ISOlat1 -->
189+
<!ENTITY uuml "&#252;"> <!-- latin small letter u with diaeresis,
190+
U+00FC ISOlat1 -->
191+
<!ENTITY yacute "&#253;"> <!-- latin small letter y with acute,
192+
U+00FD ISOlat1 -->
193+
<!ENTITY thorn "&#254;"> <!-- latin small letter thorn,
194+
U+00FE ISOlat1 -->
195+
<!ENTITY yuml "&#255;"> <!-- latin small letter y with diaeresis,
196+
U+00FF ISOlat1 -->

xml-catalog/xhtml-special.ent

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
<!-- Special characters for XHTML -->
2+
3+
<!-- Character entity set. Typical invocation:
4+
<!ENTITY % HTMLspecial PUBLIC
5+
"-//W3C//ENTITIES Special for XHTML//EN"
6+
"http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
7+
%HTMLspecial;
8+
-->
9+
10+
<!-- Portions (C) International Organization for Standardization 1986:
11+
Permission to copy in any form is granted for use with
12+
conforming SGML systems and applications as defined in
13+
ISO 8879, provided this notice is included in all copies.
14+
-->
15+
16+
<!-- Relevant ISO entity set is given unless names are newly introduced.
17+
New names (i.e., not in ISO 8879 list) do not clash with any
18+
existing ISO 8879 entity names. ISO 10646 character numbers
19+
are given for each character, in hex. Values are decimal
20+
conversions of the ISO 10646 values and refer to the document
21+
character set. Names are Unicode names.
22+
-->
23+
24+
<!-- C0 Controls and Basic Latin -->
25+
<!ENTITY quot "&#34;"> <!-- quotation mark, U+0022 ISOnum -->
26+
<!ENTITY amp "&#38;#38;"> <!-- ampersand, U+0026 ISOnum -->
27+
<!ENTITY lt "&#38;#60;"> <!-- less-than sign, U+003C ISOnum -->
28+
<!ENTITY gt "&#62;"> <!-- greater-than sign, U+003E ISOnum -->
29+
<!ENTITY apos "&#39;"> <!-- apostrophe = APL quote, U+0027 ISOnum -->
30+
31+
<!-- Latin Extended-A -->
32+
<!ENTITY OElig "&#338;"> <!-- latin capital ligature OE,
33+
U+0152 ISOlat2 -->
34+
<!ENTITY oelig "&#339;"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
35+
<!-- ligature is a misnomer, this is a separate character in some languages -->
36+
<!ENTITY Scaron "&#352;"> <!-- latin capital letter S with caron,
37+
U+0160 ISOlat2 -->
38+
<!ENTITY scaron "&#353;"> <!-- latin small letter s with caron,
39+
U+0161 ISOlat2 -->
40+
<!ENTITY Yuml "&#376;"> <!-- latin capital letter Y with diaeresis,
41+
U+0178 ISOlat2 -->
42+
43+
<!-- Spacing Modifier Letters -->
44+
<!ENTITY circ "&#710;"> <!-- modifier letter circumflex accent,
45+
U+02C6 ISOpub -->
46+
<!ENTITY tilde "&#732;"> <!-- small tilde, U+02DC ISOdia -->
47+
48+
<!-- General Punctuation -->
49+
<!ENTITY ensp "&#8194;"> <!-- en space, U+2002 ISOpub -->
50+
<!ENTITY emsp "&#8195;"> <!-- em space, U+2003 ISOpub -->
51+
<!ENTITY thinsp "&#8201;"> <!-- thin space, U+2009 ISOpub -->
52+
<!ENTITY zwnj "&#8204;"> <!-- zero width non-joiner,
53+
U+200C NEW RFC 2070 -->
54+
<!ENTITY zwj "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
55+
<!ENTITY lrm "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
56+
<!ENTITY rlm "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
57+
<!ENTITY ndash "&#8211;"> <!-- en dash, U+2013 ISOpub -->
58+
<!ENTITY mdash "&#8212;"> <!-- em dash, U+2014 ISOpub -->
59+
<!ENTITY lsquo "&#8216;"> <!-- left single quotation mark,
60+
U+2018 ISOnum -->
61+
<!ENTITY rsquo "&#8217;"> <!-- right single quotation mark,
62+
U+2019 ISOnum -->
63+
<!ENTITY sbquo "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
64+
<!ENTITY ldquo "&#8220;"> <!-- left double quotation mark,
65+
U+201C ISOnum -->
66+
<!ENTITY rdquo "&#8221;"> <!-- right double quotation mark,
67+
U+201D ISOnum -->
68+
<!ENTITY bdquo "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
69+
<!ENTITY dagger "&#8224;"> <!-- dagger, U+2020 ISOpub -->
70+
<!ENTITY Dagger "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
71+
<!ENTITY permil "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
72+
<!ENTITY lsaquo "&#8249;"> <!-- single left-pointing angle quotation mark,
73+
U+2039 ISO proposed -->
74+
<!-- lsaquo is proposed but not yet ISO standardized -->
75+
<!ENTITY rsaquo "&#8250;"> <!-- single right-pointing angle quotation mark,
76+
U+203A ISO proposed -->
77+
<!-- rsaquo is proposed but not yet ISO standardized -->
78+
79+
<!-- Currency Symbols -->
80+
<!ENTITY euro "&#8364;"> <!-- euro sign, U+20AC NEW -->

0 commit comments

Comments
 (0)