Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit 7ae54b3

Browse files
authored
Merge pull request #667 from matrix-org/dbkr/fix_markdown_spurious_html
Fix spurious HTML tags being passed through literally
2 parents e3bd522 + 853c89d commit 7ae54b3

File tree

1 file changed

+95
-62
lines changed

1 file changed

+95
-62
lines changed

src/Markdown.js

Lines changed: 95 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -15,110 +15,143 @@ limitations under the License.
1515
*/
1616

1717
import commonmark from 'commonmark';
18+
import escape from 'lodash/escape';
19+
20+
// Names of the raw HTML tags that may be passed through unescaped.
const ALLOWED_HTML_TAGS = ['del'];

// These types of node are definitely text
const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];

/**
 * Decide whether a raw HTML node consists of exactly one allowed tag.
 *
 * @param {object} node - Node whose `literal` property holds the raw HTML
 *     text (callers in this file pass `html_inline` / `html_block` nodes).
 * @returns {boolean} true only if the literal is a single opening or
 *     closing tag whose name is listed in ALLOWED_HTML_TAGS.
 */
function is_allowed_html_tag(node) {
    // Everything between the angle brackets is captured as the tag name, so
    // a tag carrying attributes (or a literal containing more than one tag)
    // never equals an allowed name and is rejected. That is acceptable
    // because we only allow <del> anyway.
    const matches = /^<\/?(.*)>$/.exec(node.literal);
    if (!matches) {
        return false;
    }
    return ALLOWED_HTML_TAGS.indexOf(matches[1]) !== -1;
}
35+
36+
/**
 * Renderer callback for raw HTML nodes. It must be called with `this` set
 * to the renderer, since it writes via `this.lit`.
 *
 * An allowed tag is emitted verbatim; anything else is escaped first so
 * that it is output as text rather than as markup.
 *
 * @param {object} node - Node whose `literal` holds the raw HTML text.
 */
function html_if_tag_allowed(node) {
    const passThrough = is_allowed_html_tag(node);
    this.lit(passThrough ? node.literal : escape(node.literal));
}
44+
45+
/*
 * Returns true if the parse output containing the node
 * comprises multiple block level elements (ie. lines),
 * or false if it is only a single line.
 *
 * The check walks up the `parent` links to the topmost ancestor and
 * compares that ancestor's first and last child: if they are not the
 * same node, there is more than one top level element.
 */
function is_multi_line(node) {
    let root = node;
    while (root.parent) {
        root = root.parent;
    }
    return root.firstChild !== root.lastChild;
}
1857

1958
/**
 * Class that wraps commonmark, adding the ability to see whether
 * a given message actually uses any markdown syntax or whether
 * it's plain text.
 */
export default class Markdown {
    /**
     * Parse the input once up front; every method below works from the
     * resulting parse tree rather than re-parsing.
     *
     * @param {string} input - The raw message text to interpret as markdown.
     */
    constructor(input) {
        this.input = input;

        const parser = new commonmark.Parser();
        this.parsed = parser.parse(this.input);
    }

    /**
     * Walk the parse tree looking for anything that is not plain text.
     *
     * @returns {boolean} false as soon as a node is found that is neither a
     *     known text node type nor a disallowed HTML tag; true otherwise.
     */
    isPlainText() {
        const walker = this.parsed.walker();

        let ev;
        while ( (ev = walker.next()) ) {
            const node = ev.node;
            if (TEXT_NODES.indexOf(node.type) > -1) {
                // definitely text
                continue;
            } else if (node.type === 'html_inline' || node.type === 'html_block') {
                // if it's an allowed html tag, we need to render it and therefore
                // we will need to use HTML. If it's not allowed, it's not HTML since
                // we'll just be treating it as text.
                if (is_allowed_html_tag(node)) {
                    return false;
                }
            } else {
                return false;
            }
        }
        return true;
    }

    /**
     * Render the parsed message to HTML, escaping any raw HTML tag that is
     * not explicitly allowed.
     *
     * @returns {string} The rendered HTML.
     */
    toHTML() {
        const renderer = new commonmark.HtmlRenderer({safe: false});
        const real_paragraph = renderer.paragraph;

        renderer.paragraph = function(node, entering) {
            // If there is only one top level node, just return the
            // bare text: it's a single line of text and so should be
            // 'inline', rather than unnecessarily wrapped in its own
            // p tag. If, however, we have multiple nodes, each gets
            // its own p tag to keep them as separate paragraphs.
            if (is_multi_line(node)) {
                real_paragraph.call(this, node, entering);
            }
        };

        renderer.html_inline = html_if_tag_allowed;
        renderer.html_block = function(node) {
            // as with `paragraph`, we only insert line breaks
            // if there are multiple lines in the markdown.
            const isMultiLine = is_multi_line(node);

            if (isMultiLine) this.cr();
            html_if_tag_allowed.call(this, node);
            if (isMultiLine) this.cr();
        };

        return renderer.render(this.parsed);
    }

    /*
     * Render the markdown message to plain text. That is, essentially
     * just remove any backslashes escaping what would otherwise be
     * markdown syntax
     * (to fix https://github.com/vector-im/riot-web/issues/2870)
     */
    toPlaintext() {
        const renderer = new commonmark.HtmlRenderer({safe: false});

        // The default `out` function only sends the input through an XML
        // escaping function, which causes messages to be entity encoded,
        // which we don't want in this case.
        renderer.out = function(s) {
            // The `lit` function adds a string literal to the output buffer.
            this.lit(s);
        };

        renderer.paragraph = function(node, entering) {
            // as with toHTML, only append lines to paragraphs if there are
            // multiple paragraphs
            if (is_multi_line(node)) {
                if (!entering && node.next) {
                    this.lit('\n\n');
                }
            }
        };
        renderer.html_block = function(node) {
            // Raw HTML is emitted untouched; a blank line separates it from
            // a following sibling only when the message has multiple lines.
            this.lit(node.literal);
            if (is_multi_line(node) && node.next) this.lit('\n\n');
        };

        return renderer.render(this.parsed);
    }
}

0 commit comments

Comments
 (0)