Skip to content

Commit 52b24c2

Browse files
authored
Implement escapes in Markdown to RDoc conversion (#1575)
Fixes #919. The plain-text part of parsed Markdown may contain special characters (example: `+_*<`), and the URL in a tidy link may contain `[]`. These characters need to be escaped. `{}[]` also need escaping, but they are not escaped in this pull request. RDoc-style tidylinks in Markdown are still available for now.
1 parent 74e054c commit 52b24c2

File tree

6 files changed

+136
-42
lines changed

6 files changed

+136
-42
lines changed

lib/rdoc/markdown.kpeg

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,20 @@
303303
end
304304
end
305305

306+
# Escapes every character that has special meaning in RDoc format (<tt>*</tt>, <tt>+</tt>, <tt><</tt>, <tt>_</tt> and the backslash itself) by prefixing it with a backslash.
307+
# Brackets and braces are deliberately left unescaped so that RDoc-style links written in Markdown keep working for now.
308+
309+
def rdoc_escape(text)
310+
text.gsub(/[*+<\\_]/) {|s| "\\#{s}" }
311+
end
312+
313+
# Escapes brackets and backslashes in a link URL by prefixing each of them with a backslash.
314+
# Brackets need escaping because the link URL will be surrounded by `[]` in RDoc format.
315+
316+
def rdoc_link_url_escape(text)
317+
text.gsub(/[\[\]\\]/) {|s| "\\#{s}" }
318+
end
319+
306320
##
307321
# :category: Extensions
308322
#
@@ -969,11 +983,11 @@ Space = @Spacechar+ { " " }
969983

970984
Str = @StartList:a
971985
< @NormalChar+ > { a = text }
972-
( StrChunk:c { a << c } )* { a }
986+
( StrChunk:c { a << c } )* { rdoc_escape(a) }
973987

974988
StrChunk = < (@NormalChar | /_+/ &Alphanumeric)+ > { text }
975989

976-
EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { text }
990+
EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { rdoc_escape(text) }
977991

978992
Entity = ( HexEntity | DecEntity | CharEntity ):a { a }
979993

@@ -988,7 +1002,7 @@ TerminalEndline = @Sp @Newline @Eof
9881002
LineBreak = " " @NormalEndline { RDoc::Markup::HardBreak.new }
9891003

9901004
Symbol = < @SpecialChar >
991-
{ text }
1005+
{ rdoc_escape(text) }
9921006

9931007
# This keeps the parser from getting bogged down on long strings of '*' or '_',
9941008
# or strings of '*' or '_' with space on each side:
@@ -1053,7 +1067,7 @@ ReferenceLinkSingle = Label:content < (Spnl "[]")? >
10531067
{ link_to content, content, text }
10541068

10551069
ExplicitLink = ExplicitLinkWithLabel:a
1056-
{ "{#{a[:label]}}[#{a[:link]}]" }
1070+
{ "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" }
10571071

10581072
ExplicitLinkWithLabel = Label:label "(" @Sp Source:link Spnl Title @Sp ")"
10591073
{ { label: label, link: link } }
@@ -1163,12 +1177,12 @@ Newline = %literals.Newline
11631177
Spacechar = %literals.Spacechar
11641178

11651179
HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";"
1166-
{ [text.to_i(16)].pack 'U' }
1180+
{ rdoc_escape([text.to_i(16)].pack('U')) }
11671181
DecEntity = "&#" < /[0-9]+/ > ";"
1168-
{ [text.to_i].pack 'U' }
1182+
{ rdoc_escape([text.to_i].pack('U')) }
11691183
CharEntity = "&" </[A-Za-z0-9]+/ > ";"
11701184
{ if entity = HTML_ENTITIES[text] then
1171-
entity.pack 'U*'
1185+
rdoc_escape(entity.pack('U*'))
11721186
else
11731187
"&#{text};"
11741188
end

lib/rdoc/markdown.rb

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,20 @@ def emphasis text
688688
end
689689
end
690690

691+
# Escapes every character that has special meaning in RDoc format (<tt>*</tt>, <tt>+</tt>, <tt><</tt>, <tt>_</tt> and the backslash itself) by prefixing it with a backslash.
692+
# Brackets and braces are deliberately left unescaped so that RDoc-style links written in Markdown keep working for now.
693+
694+
def rdoc_escape(text)
695+
text.gsub(/[*+<\\_]/) {|s| "\\#{s}" }
696+
end
697+
698+
# Escapes brackets and backslashes in a link URL by prefixing each of them with a backslash.
699+
# Brackets need escaping because the link URL will be surrounded by `[]` in RDoc format.
700+
701+
def rdoc_link_url_escape(text)
702+
text.gsub(/[\[\]\\]/) {|s| "\\#{s}" }
703+
end
704+
691705
##
692706
# :category: Extensions
693707
#
@@ -9731,7 +9745,7 @@ def _Space
97319745
return _tmp
97329746
end
97339747

9734-
# Str = @StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { a }
9748+
# Str = @StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { rdoc_escape(a) }
97359749
def _Str
97369750

97379751
_save = self.pos
@@ -9792,7 +9806,7 @@ def _Str
97929806
self.pos = _save
97939807
break
97949808
end
9795-
@result = begin; a ; end
9809+
@result = begin; rdoc_escape(a) ; end
97969810
_tmp = true
97979811
unless _tmp
97989812
self.pos = _save
@@ -9894,7 +9908,7 @@ def _StrChunk
98949908
return _tmp
98959909
end
98969910

9897-
# EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { text }
9911+
# EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { rdoc_escape(text) }
98989912
def _EscapedChar
98999913

99009914
_save = self.pos
@@ -9921,7 +9935,7 @@ def _EscapedChar
99219935
self.pos = _save
99229936
break
99239937
end
9924-
@result = begin; text ; end
9938+
@result = begin; rdoc_escape(text) ; end
99259939
_tmp = true
99269940
unless _tmp
99279941
self.pos = _save
@@ -10122,7 +10136,7 @@ def _LineBreak
1012210136
return _tmp
1012310137
end
1012410138

10125-
# Symbol = < @SpecialChar > { text }
10139+
# Symbol = < @SpecialChar > { rdoc_escape(text) }
1012610140
def _Symbol
1012710141

1012810142
_save = self.pos
@@ -10136,7 +10150,7 @@ def _Symbol
1013610150
self.pos = _save
1013710151
break
1013810152
end
10139-
@result = begin; text ; end
10153+
@result = begin; rdoc_escape(text) ; end
1014010154
_tmp = true
1014110155
unless _tmp
1014210156
self.pos = _save
@@ -11189,7 +11203,7 @@ def _ReferenceLinkSingle
1118911203
return _tmp
1119011204
end
1119111205

11192-
# ExplicitLink = ExplicitLinkWithLabel:a { "{#{a[:label]}}[#{a[:link]}]" }
11206+
# ExplicitLink = ExplicitLinkWithLabel:a { "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" }
1119311207
def _ExplicitLink
1119411208

1119511209
_save = self.pos
@@ -11200,7 +11214,7 @@ def _ExplicitLink
1120011214
self.pos = _save
1120111215
break
1120211216
end
11203-
@result = begin; "{#{a[:label]}}[#{a[:link]}]" ; end
11217+
@result = begin; "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" ; end
1120411218
_tmp = true
1120511219
unless _tmp
1120611220
self.pos = _save
@@ -14615,7 +14629,7 @@ def _Spacechar
1461514629
return _tmp
1461614630
end
1461714631

14618-
# HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";" { [text.to_i(16)].pack 'U' }
14632+
# HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";" { rdoc_escape([text.to_i(16)].pack('U')) }
1461914633
def _HexEntity
1462014634

1462114635
_save = self.pos
@@ -14639,7 +14653,7 @@ def _HexEntity
1463914653
self.pos = _save
1464014654
break
1464114655
end
14642-
@result = begin; [text.to_i(16)].pack 'U' ; end
14656+
@result = begin; rdoc_escape([text.to_i(16)].pack('U')) ; end
1464314657
_tmp = true
1464414658
unless _tmp
1464514659
self.pos = _save
@@ -14651,7 +14665,7 @@ def _HexEntity
1465114665
return _tmp
1465214666
end
1465314667

14654-
# DecEntity = "&#" < /[0-9]+/ > ";" { [text.to_i].pack 'U' }
14668+
# DecEntity = "&#" < /[0-9]+/ > ";" { rdoc_escape([text.to_i].pack('U')) }
1465514669
def _DecEntity
1465614670

1465714671
_save = self.pos
@@ -14675,7 +14689,7 @@ def _DecEntity
1467514689
self.pos = _save
1467614690
break
1467714691
end
14678-
@result = begin; [text.to_i].pack 'U' ; end
14692+
@result = begin; rdoc_escape([text.to_i].pack('U')) ; end
1467914693
_tmp = true
1468014694
unless _tmp
1468114695
self.pos = _save
@@ -14687,7 +14701,7 @@ def _DecEntity
1468714701
return _tmp
1468814702
end
1468914703

14690-
# CharEntity = "&" < /[A-Za-z0-9]+/ > ";" { if entity = HTML_ENTITIES[text] then entity.pack 'U*' else "&#{text};" end }
14704+
# CharEntity = "&" < /[A-Za-z0-9]+/ > ";" { if entity = HTML_ENTITIES[text] then rdoc_escape(entity.pack('U*')) else "&#{text};" end }
1469114705
def _CharEntity
1469214706

1469314707
_save = self.pos
@@ -14712,7 +14726,7 @@ def _CharEntity
1471214726
break
1471314727
end
1471414728
@result = begin; if entity = HTML_ENTITIES[text] then
14715-
entity.pack 'U*'
14729+
rdoc_escape(entity.pack('U*'))
1471614730
else
1471714731
"&#{text};"
1471814732
end
@@ -16563,15 +16577,15 @@ def _DefinitionListDefinition
1656316577
Rules[:_Inlines] = rule_info("Inlines", "(!@Endline Inline:i { i } | @Endline:c !(&{ github? } Ticks3 /[^`\\n]*$/) &Inline { c })+:chunks @Endline? { chunks }")
1656416578
Rules[:_Inline] = rule_info("Inline", "(Str | @Endline | UlOrStarLine | @Space | Strong | Emph | Strike | Image | Link | NoteReference | InlineNote | Code | RawHtml | Entity | EscapedChar | Symbol)")
1656516579
Rules[:_Space] = rule_info("Space", "@Spacechar+ { \" \" }")
16566-
Rules[:_Str] = rule_info("Str", "@StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { a }")
16580+
Rules[:_Str] = rule_info("Str", "@StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { rdoc_escape(a) }")
1656716581
Rules[:_StrChunk] = rule_info("StrChunk", "< (@NormalChar | /_+/ &Alphanumeric)+ > { text }")
16568-
Rules[:_EscapedChar] = rule_info("EscapedChar", "\"\\\\\" !@Newline < /[:\\\\`|*_{}\\[\\]()\#+.!><-]/ > { text }")
16582+
Rules[:_EscapedChar] = rule_info("EscapedChar", "\"\\\\\" !@Newline < /[:\\\\`|*_{}\\[\\]()\#+.!><-]/ > { rdoc_escape(text) }")
1656916583
Rules[:_Entity] = rule_info("Entity", "(HexEntity | DecEntity | CharEntity):a { a }")
1657016584
Rules[:_Endline] = rule_info("Endline", "(@LineBreak | @TerminalEndline | @NormalEndline)")
1657116585
Rules[:_NormalEndline] = rule_info("NormalEndline", "@Sp @Newline !@BlankLine !\">\" !AtxStart !(Line /={1,}|-{1,}/ @Newline) { \"\\n\" }")
1657216586
Rules[:_TerminalEndline] = rule_info("TerminalEndline", "@Sp @Newline @Eof")
1657316587
Rules[:_LineBreak] = rule_info("LineBreak", "\" \" @NormalEndline { RDoc::Markup::HardBreak.new }")
16574-
Rules[:_Symbol] = rule_info("Symbol", "< @SpecialChar > { text }")
16588+
Rules[:_Symbol] = rule_info("Symbol", "< @SpecialChar > { rdoc_escape(text) }")
1657516589
Rules[:_UlOrStarLine] = rule_info("UlOrStarLine", "(UlLine | StarLine):a { a }")
1657616590
Rules[:_StarLine] = rule_info("StarLine", "(< /\\*{4,}/ > { text } | < @Spacechar /\\*+/ &@Spacechar > { text })")
1657716591
Rules[:_UlLine] = rule_info("UlLine", "(< /_{4,}/ > { text } | < @Spacechar /_+/ &@Spacechar > { text })")
@@ -16588,7 +16602,7 @@ def _DefinitionListDefinition
1658816602
Rules[:_ReferenceLink] = rule_info("ReferenceLink", "(ReferenceLinkDouble | ReferenceLinkSingle)")
1658916603
Rules[:_ReferenceLinkDouble] = rule_info("ReferenceLinkDouble", "Label:content < Spnl > !\"[]\" Label:label { link_to content, label, text }")
1659016604
Rules[:_ReferenceLinkSingle] = rule_info("ReferenceLinkSingle", "Label:content < (Spnl \"[]\")? > { link_to content, content, text }")
16591-
Rules[:_ExplicitLink] = rule_info("ExplicitLink", "ExplicitLinkWithLabel:a { \"{\#{a[:label]}}[\#{a[:link]}]\" }")
16605+
Rules[:_ExplicitLink] = rule_info("ExplicitLink", "ExplicitLinkWithLabel:a { \"{\#{a[:label]}}[\#{rdoc_link_url_escape(a[:link])}]\" }")
1659216606
Rules[:_ExplicitLinkWithLabel] = rule_info("ExplicitLinkWithLabel", "Label:label \"(\" @Sp Source:link Spnl Title @Sp \")\" { { label: label, link: link } }")
1659316607
Rules[:_Source] = rule_info("Source", "(\"<\" < SourceContents > \">\" | < SourceContents >) { text }")
1659416608
Rules[:_SourceContents] = rule_info("SourceContents", "((!\"(\" !\")\" !\">\" Nonspacechar)+ | \"(\" SourceContents \")\")*")
@@ -16631,9 +16645,9 @@ def _DefinitionListDefinition
1663116645
Rules[:_BOM] = rule_info("BOM", "%literals.BOM")
1663216646
Rules[:_Newline] = rule_info("Newline", "%literals.Newline")
1663316647
Rules[:_Spacechar] = rule_info("Spacechar", "%literals.Spacechar")
16634-
Rules[:_HexEntity] = rule_info("HexEntity", "/&\#x/i < /[0-9a-fA-F]+/ > \";\" { [text.to_i(16)].pack 'U' }")
16635-
Rules[:_DecEntity] = rule_info("DecEntity", "\"&\#\" < /[0-9]+/ > \";\" { [text.to_i].pack 'U' }")
16636-
Rules[:_CharEntity] = rule_info("CharEntity", "\"&\" < /[A-Za-z0-9]+/ > \";\" { if entity = HTML_ENTITIES[text] then entity.pack 'U*' else \"&\#{text};\" end }")
16648+
Rules[:_HexEntity] = rule_info("HexEntity", "/&\#x/i < /[0-9a-fA-F]+/ > \";\" { rdoc_escape([text.to_i(16)].pack('U')) }")
16649+
Rules[:_DecEntity] = rule_info("DecEntity", "\"&\#\" < /[0-9]+/ > \";\" { rdoc_escape([text.to_i].pack('U')) }")
16650+
Rules[:_CharEntity] = rule_info("CharEntity", "\"&\" < /[A-Za-z0-9]+/ > \";\" { if entity = HTML_ENTITIES[text] then rdoc_escape(entity.pack('U*')) else \"&\#{text};\" end }")
1663716651
Rules[:_NonindentSpace] = rule_info("NonindentSpace", "/ {0,3}/")
1663816652
Rules[:_Indent] = rule_info("Indent", "/\\t| /")
1663916653
Rules[:_IndentedLine] = rule_info("IndentedLine", "Indent Line")

lib/rdoc/markup/inline_parser.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,10 @@ def scan_token
303303
# Returns nil if no valid URL part is found.
304304
# URL part is enclosed in square brackets and may contain escaped brackets.
305305
# Example: <tt>[http://example.com/?q=\[\]]</tt> represents <tt>http://example.com/?q=[]</tt>.
306+
# Because rdoc-style links are still accepted in markdown, the URL may also contain backslash-escaped <tt>*+<_</tt>.
306307

307308
def read_tidylink_url
308-
bracketed_url = strscan(/\[([^\s\[\]\\]|\\[\[\]\\])+\]/)
309+
# Match a bracketed URL without whitespace; a backslash is only accepted in front of [, ], \, *, +, < or _.
bracketed_url = strscan(/\[([^\s\[\]\\]|\\[\[\]\\*+<_])+\]/)
309310
# Drop the surrounding brackets and remove the backslash from each escaped character.
bracketed_url[1...-1].gsub(/\\(.)/, '\1') if bracketed_url
310311
end
311312
end

test/rdoc/markup/to_html_test.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,22 @@ def test_convert_TIDYLINK_multiple
736736
assert_equal expected, result
737737
end
738738

739+
# Backslash escapes in a tidylink label and URL are removed in the generated HTML.
def test_convert_TIDYLINK_url_unescape
740+
# markdown: [{label}](http://example.com/_foo?q=bar+baz[])
741+
result = @to.convert '{\{label\}}[http://example.com/_foo?q=bar+baz\[\]]'
742+
expected = "\n<p><a href=\"http://example.com/_foo?q=bar+baz[]\">{label}</a></p>\n"
743+
assert_equal expected, result
744+
end
745+
746+
# Backslash-escaped <, +, _ and * inside the [url] part of a tidylink are unescaped in the resulting href.
def test_convert_TIDYLINK_rdoc_in_markdown_url_unescape
747+
# markdown: {label}[http://example.com/?q=<+_*]
748+
# The above text is plain text in markdown, so <+_* are escaped in HTML.
749+
# If we're accepting rdoc-style links in markdown, these escapes should be allowed in the [url] part.
750+
result = @to.convert '{label}[http://example.com/?q=\<\+\_\*]'
751+
expected = "\n<p><a href=\"http://example.com/?q=&lt;+_*\">label</a></p>\n"
752+
assert_equal expected, result
753+
end
754+
739755
def test_convert_TIDYLINK_with_code_label
740756
result = @to.convert '{Link to +Foo+}[https://example.com]'
741757

test/rdoc/rdoc_markdown_test.rb

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,11 +480,11 @@ def test_parse_emphasis_underscore
480480
end
481481

482482
# Underscores embedded in words come out backslash-escaped, while _em1_ and *em2* are still parsed as emphasis.
def test_parse_emphasis_underscore_embedded
483-
doc = parse "foo_bar bar_baz\n"
483+
doc = parse "foo_bar bar_baz _em1_ *em2*\n"
484484

485485
expected =
486486
doc(
487-
para("foo_bar bar_baz"))
487+
para("foo\\_bar bar\\_baz _em1_ _em2_"))
488488

489489
assert_equal expected, doc
490490
end
@@ -494,15 +494,64 @@ def test_parse_emphasis_underscore_in_word
494494

495495
expected =
496496
doc(
497-
para("it foo_bar_baz"))
497+
para("it foo\\_bar\\_baz"))
498498

499499
assert_equal expected, doc
500500
end
501501

502+
# Plus signs in normal Markdown text are backslash-escaped so they are not read as RDoc +code+ markup;
# a Markdown-escaped backslash is itself re-escaped.
def test_rdoc_code_escaped_in_normal_text
503+
doc = parse "+notcode+ \\+notcode+ \\\\+notcode+"
504+
expected = doc(para("\\+notcode\\+ \\+notcode\\+ \\\\\\+notcode\\+"))
505+
assert_equal expected, doc
506+
end
507+
508+
# Special characters produced by hex, decimal and named entities are backslash-escaped like literal ones.
def test_escape_character_entities
509+
doc = parse "&#x3C;tt>&#x2A;\\</tt> &#60;tt>&#43;\\</tt> &lt;tt>&lowbar;\\</tt>"
510+
expected = doc(para("\\<tt>\\*\\</tt> \\<tt>\\+\\</tt> \\<tt>\\_\\</tt>"))
511+
assert_equal expected, doc
512+
end
513+
514+
# Text inside Markdown emphasis and strong is escaped; text inside a code span is not.
def test_rdoc_escape_in_markdown_styling
515+
doc = parse "_a \\_b\\_ c_ **+d+** `_1+2*3`"
516+
expected = doc(para("<em>a \\_b\\_ c</em> <b>\\+d\\+</b> <code>_1+2*3</code>"))
517+
assert_equal expected, doc
518+
end
519+
520+
# A line starting with an equals sign is parsed as a plain paragraph, not as a heading.
def test_rdoc_heading_escaped_inside_markdown
521+
doc = parse "= notheading\n"
522+
expected = doc(para("= notheading"))
523+
assert_equal expected, doc
524+
end
525+
526+
# Plus signs inside Markdown strikethrough are backslash-escaped.
def test_rdoc_code_escaped_inside_markdown
527+
doc = parse "~~+notcode+~~"
528+
expected = doc(para("<del>\\+notcode\\+</del>"))
529+
assert_equal expected, doc
530+
end
531+
532+
# The content of a Markdown code span is left unescaped.
def test_no_rdoc_escape_inside_markdown_code
533+
doc = parse "`+foo+`"
534+
expected = doc(para("<code>+foo+</code>"))
535+
assert_equal expected, doc
536+
end
537+
538+
# In a Markdown link, label text is escaped (code span content is not) and brackets in the URL are backslash-escaped.
def test_rdoc_format_escaped_inside_markdown_link
539+
doc = parse "[Link +to+ `tap{ +1+ }`](http://example.com/?q=[])"
540+
expected = doc(para("{Link \\+to\\+ <code>tap{ +1+ }</code>}[http://example.com/?q=\\[\\]]"))
541+
assert_equal expected, doc
542+
end
543+
544+
# A Markdown-escaped "<" stays backslash-escaped in the output, while unescaped tags pass through unchanged.
def test_lt_escape
545+
doc = parse "\\<b>`a`\\</b> <b>\\</b>`b`</b>"
546+
expected = doc(para("\\<b><code>a</code>\\</b> <b>\\</b><code>b</code></b>"))
547+
assert_equal expected, doc
548+
end
549+
502550
# Markdown backslash escapes of a backtick and a colon yield the bare character;
# an escaped backslash yields a backslash that is escaped again for RDoc.
def test_parse_escape
503551
assert_equal doc(para("Backtick: `")), parse("Backtick: \\`")
504552

505-
assert_equal doc(para("Backslash: \\")), parse("Backslash: \\\\")
553+
# Unescaped as markdown and then escaped as RDoc
554+
assert_equal doc(para("Backslash: \\\\")), parse("Backslash: \\\\")
506555

507556
assert_equal doc(para("Colon: :")), parse("Colon: \\:")
507508
end

0 commit comments

Comments
 (0)