Skip to content

Commit 31936ab

Browse files
andersk and Alir3z4 authored
Fix invalid character reference parsing (#429)
https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
Fixes #310.
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
Co-authored-by: Alireza Savand <alireza.savand@gmail.com>
1 parent 24e6b2f commit 31936ab

File tree

4 files changed

+42
-5
lines changed

4 files changed

+42
-5
lines changed

html2text/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from ._typing import OutCallback
1313
from .elements import AnchorElement, ListElement
1414
from .utils import (
15+
control_character_replacements,
1516
dumb_css_parser,
1617
element_style,
1718
escape_md,
@@ -917,13 +918,14 @@ def charref(self, name: str) -> str:
917918
else:
918919
c = int(name)
919920

921+
if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000: # invalid or surrogate
922+
c = 0xFFFD # REPLACEMENT CHARACTER
923+
c = control_character_replacements.get(c, c)
924+
920925
if not self.unicode_snob and c in unifiable_n:
921926
return unifiable_n[c]
922927
else:
923-
try:
924-
return chr(c)
925-
except ValueError: # invalid unicode
926-
return ""
928+
return chr(c)
927929

928930
def entityref(self, c: str) -> str:
929931
if not self.unicode_snob and c in config.UNIFIABLE:

html2text/utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,37 @@
99
if k != "nbsp"
1010
}
1111

12+
# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
13+
control_character_replacements = {
14+
0x80: 0x20AC, # EURO SIGN (€)
15+
0x82: 0x201A, # SINGLE LOW-9 QUOTATION MARK (‚)
16+
0x83: 0x0192, # LATIN SMALL LETTER F WITH HOOK (ƒ)
17+
0x84: 0x201E, # DOUBLE LOW-9 QUOTATION MARK („)
18+
0x85: 0x2026, # HORIZONTAL ELLIPSIS (…)
19+
0x86: 0x2020, # DAGGER (†)
20+
0x87: 0x2021, # DOUBLE DAGGER (‡)
21+
0x88: 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
22+
0x89: 0x2030, # PER MILLE SIGN (‰)
23+
0x8A: 0x0160, # LATIN CAPITAL LETTER S WITH CARON (Š)
24+
0x8B: 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
25+
0x8C: 0x0152, # LATIN CAPITAL LIGATURE OE (Œ)
26+
0x8E: 0x017D, # LATIN CAPITAL LETTER Z WITH CARON (Ž)
27+
0x91: 0x2018, # LEFT SINGLE QUOTATION MARK (‘)
28+
0x92: 0x2019, # RIGHT SINGLE QUOTATION MARK (’)
29+
0x93: 0x201C, # LEFT DOUBLE QUOTATION MARK (“)
30+
0x94: 0x201D, # RIGHT DOUBLE QUOTATION MARK (”)
31+
0x95: 0x2022, # BULLET (•)
32+
0x96: 0x2013, # EN DASH (–)
33+
0x97: 0x2014, # EM DASH (—)
34+
0x98: 0x02DC, # SMALL TILDE (˜)
35+
0x99: 0x2122, # TRADE MARK SIGN (™)
36+
0x9A: 0x0161, # LATIN SMALL LETTER S WITH CARON (š)
37+
0x9B: 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
38+
0x9C: 0x0153, # LATIN SMALL LIGATURE OE (œ)
39+
0x9E: 0x017E, # LATIN SMALL LETTER Z WITH CARON (ž)
40+
0x9F: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
41+
}
42+
1243

1344
def hn(tag: str) -> int:
1445
if tag[0] == "h" and len(tag) == 2:

test/invalid_unicode.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
B&#3291685;r
2+
3+
&#x80;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8e;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9e;&#x9f;
4+
5+
&#0;&#xd800;&#xdfff;&#x110000;

test/invalid_unicode.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
Br
1+
B�r €‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ����

0 commit comments

Comments
 (0)