Skip to content

Commit df93ce4

Browse files
authored
Merge pull request #113 from ahrefs/yasu/pp-mrkdwn
Replace omd `to_markdown` with pretty-printer for mrkdwn
2 parents 59dce79 + 51385ff commit df93ce4

File tree

3 files changed

+196
-56
lines changed

3 files changed

+196
-56
lines changed

lib/mrkdwn.ml

Lines changed: 193 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
open Omd
22
open Base
33

4-
let escape_url_chars = Staged.unstage @@ String.Escaping.escape ~escapeworthy:[ '<'; '>'; '|' ] ~escape_char:'\\'
5-
64
(* https://api.slack.com/reference/surfaces/formatting#escaping *)
75
let escape_mrkdwn =
86
String.concat_map ~f:(function
@@ -11,56 +9,198 @@ let escape_mrkdwn =
119
| '&' -> "&amp;"
1210
| c -> String.make 1 c)
1311

14-
(** Unescape markdown characters escaped because "any ASCII punctuation
15-
character may be backslash-escaped"
16-
https://spec.commonmark.org/0.30/#backslash-escapes
17-
18-
This pertains to '\\', '[', ']', '(', ')', '`', '*' unconditionally,
19-
and '.', '-', '+', '!', '<', '>', '#' depending on chars before/after.
20-
Argument escapeworthy_map can be left blank because escaped chars are
21-
unescaped to themselves. *)
22-
let unescape_omd = Staged.unstage @@ String.Escaping.unescape_gen_exn ~escapeworthy_map:[] ~escape_char:'\\'
23-
24-
(** Escape the `escape_char` '\\' for use with `unescape_omd` later *)
25-
let escape_omd = Staged.unstage @@ String.Escaping.escape_gen_exn ~escapeworthy_map:[] ~escape_char:'\\'
26-
27-
let transform_text = escape_mrkdwn
28-
29-
(** `Omd.to_markdown` escapes backslash (and other applicable chars) in
30-
`Text` elements but not `Code` elements, so do the same for the latter so
31-
that `unescape_omd` can apply uniformly to the whole mrkdwn string later *)
32-
let transform_code s = escape_omd @@ escape_mrkdwn s
33-
34-
let rec transform_list = List.map ~f:transform
35-
36-
and transform_flatten = List.map ~f:transform_list
37-
38-
and surround s t =
39-
let t = to_markdown @@ transform_list t in
40-
Raw (Printf.sprintf "%s%s%s" s t s)
12+
(** Translates omd AST to a Slack mrkdwn string. Code heavily adapted
13+
from omd 1.3.1 source.
14+
https://github.com/ocaml/omd/blob/1.3.1/src/omd_backend.ml#L872
15+
*)
16+
let rec mrkdwn_of_md md =
17+
let b = Buffer.create 128 in
18+
let references : ref_container option ref = ref None in
19+
let nl b = Buffer.add_char b '\n' in
20+
let nl_if_needed_above b =
21+
if Buffer.length b > 0 && (not @@ Char.equal '\n' (Buffer.nth b (Buffer.length b - 1))) then nl b
22+
in
23+
let add_spaces n =
24+
for _i = 1 to n do
25+
Buffer.add_char b ' '
26+
done
27+
in
28+
let rec loop ?(fst_p_in_li = true) ?(is_in_list = false) list_indent =
29+
(* [list_indent: int] is the indentation level in number of spaces.
30+
[fst_p_in_li: bool] is used to apply different indentation to the first
31+
paragraph in a list items.
32+
[is_in_list: bool] is necessary to know if we are inside a paragraph
33+
which is inside a list item because those need to be indented!
34+
*)
35+
let loop ?(fst_p_in_li = fst_p_in_li) ?(is_in_list = is_in_list) list_indent l =
36+
loop ~fst_p_in_li ~is_in_list list_indent l
37+
in
38+
function
39+
| [] -> ()
40+
| el :: tl ->
41+
match el with
42+
| X _ -> loop list_indent tl
43+
| Blockquote q ->
44+
(* mrkdwn doesn't support nested quotes, but output '>' chars anyway*)
45+
let quote s =
46+
let b = Buffer.create (String.length s) in
47+
let l = String.length s in
48+
let rec loop is_nl i =
49+
if i < l then begin
50+
if is_nl && i < l - 1 then Buffer.add_string b "> ";
51+
match s.[i] with
52+
| '\n' ->
53+
nl b;
54+
loop true (i + 1)
55+
| c ->
56+
Buffer.add_char b c;
57+
loop false (i + 1)
58+
end
59+
else Buffer.contents b
60+
in
61+
loop true 0
62+
in
63+
Buffer.add_string b (quote @@ mrkdwn_of_md q);
64+
if not @@ List.is_empty tl then nl_if_needed_above b;
65+
loop list_indent tl
66+
| Ref (rc, _name, _text, fallback) | Img_ref (rc, _name, _text, fallback) ->
67+
(* [rc] stores all refs from document, so it's enough to record just the
68+
first encounter
69+
*)
70+
if Option.is_empty !references then references := Some rc;
71+
(* [fallback#to_string] renders as
72+
[<text>][<name>] for Ref, e.g., [interesting fact][1]
73+
and
74+
![<text>][<name>] for Img_ref, e.g., ![image of cat][1]
75+
*)
76+
loop list_indent (Raw fallback#to_string :: tl)
77+
| Paragraph [] -> loop list_indent tl
78+
| Paragraph md ->
79+
(* indent if inside a list (Olp or Ulp) *)
80+
if is_in_list then if fst_p_in_li then add_spaces (list_indent - 2) else add_spaces list_indent;
81+
(* paragraph body + skip line *)
82+
loop ~fst_p_in_li:false list_indent md;
83+
nl b;
84+
nl b;
85+
loop ~fst_p_in_li:false list_indent tl
86+
| Img (alt, src, title) -> loop list_indent (Url (src, [ Text alt ], title) :: tl)
87+
| Text t ->
88+
Buffer.add_string b @@ escape_mrkdwn t;
89+
loop list_indent tl
90+
| Raw s ->
91+
Buffer.add_string b s;
92+
loop list_indent tl
93+
| Raw_block s ->
94+
nl b;
95+
Buffer.add_string b s;
96+
nl b;
97+
loop list_indent tl
98+
| Emph md' ->
99+
Buffer.add_string b "_";
100+
loop list_indent md';
101+
Buffer.add_string b "_";
102+
loop list_indent tl
103+
| Bold md' ->
104+
Buffer.add_string b "*";
105+
loop list_indent md';
106+
Buffer.add_string b "*";
107+
loop list_indent tl
108+
| Ul l ->
109+
nl_if_needed_above b;
110+
List.iter l ~f:(fun li ->
111+
add_spaces list_indent;
112+
Buffer.add_string b "- ";
113+
loop ~is_in_list:true (list_indent + 4) li;
114+
nl_if_needed_above b);
115+
if list_indent = 0 then nl b;
116+
loop list_indent tl
117+
| Ol l ->
118+
nl_if_needed_above b;
119+
List.iteri l ~f:(fun i li ->
120+
add_spaces list_indent;
121+
Printf.bprintf b "%d. " (i + 1);
122+
loop ~is_in_list:true (list_indent + 4) li;
123+
nl_if_needed_above b);
124+
if list_indent = 0 then nl b;
125+
loop list_indent tl
126+
| Ulp l ->
127+
List.iter l ~f:(fun li ->
128+
nl_if_needed_above b;
129+
add_spaces list_indent;
130+
Buffer.add_string b "- ";
131+
loop ~is_in_list:true (list_indent + 4) li (* Paragraphs => No need of '\n' *));
132+
loop list_indent tl
133+
| Olp l ->
134+
List.iteri l ~f:(fun i li ->
135+
nl_if_needed_above b;
136+
add_spaces list_indent;
137+
Printf.bprintf b "%d. " i;
138+
loop ~is_in_list:true (list_indent + 4) li (* Paragraphs => No need of '\n' *));
139+
loop list_indent tl
140+
| Code_block (_lang, c) ->
141+
(* unlike commonmark, can't have code block inside lists, so print code block with
142+
zero indent, but continue rest of the list at correct indent after
41143
42-
(** massage markdown AST so that rendered result looks like slack mrkdwn *)
43-
and transform = function
44-
| H1 t | H2 t | H3 t | H4 t | H5 t | H6 t -> Paragraph (transform_list [ Bold t ])
45-
| Paragraph t -> Paragraph (transform_list t)
46-
| Emph t -> surround "_" (transform_list t)
47-
| Bold t -> surround "*" (transform_list t)
48-
| Ul ts -> Ul (transform_flatten ts)
49-
| Ol ts -> Ol (transform_flatten ts)
50-
| Ulp ts -> Ulp (transform_flatten ts)
51-
| Olp ts -> Olp (transform_flatten ts)
52-
| Url (href, label, title) ->
53-
let label = escape_url_chars @@ to_markdown @@ transform_list label in
54-
let title = if String.length title > 0 then Printf.sprintf "%s - " @@ escape_url_chars title else title in
55-
Raw (Printf.sprintf "<%s|%s%s>" href title label)
56-
| Html _ as e -> Raw (Printf.sprintf "`%s`" @@ to_markdown [ e ])
57-
| Html_comment _ -> Br
58-
| Html_block _ as e -> Code_block ("", to_markdown [ e ])
59-
| Blockquote t -> Blockquote (transform_list t)
60-
| Img (alt, src, title) -> transform @@ Url (src, [ Text alt ], title)
61-
| Code_block (_, str) -> Code_block ("", transform_code str)
62-
| Code (_, str) -> Code ("", transform_code str)
63-
| Text s -> Text (transform_text s)
64-
| (Br | Hr | NL | Ref _ | Img_ref _ | Raw _ | Raw_block _ | X _) as e -> e
144+
note: sometimes indentation intended as list item paragraph is wrongly
145+
interpreted as code block - an issue with Omd.of_string
146+
e.g., both should be parsed the second way, but aren't:
147+
# of_string " - foo\n\n bar";;
148+
- : t = [Ul [[Text "foo"]]; NL; NL; Code_block ("", "bar")]
149+
# of_string "- foo\n\n bar";;
150+
- : t = [Ulp [[Paragraph [Text "foo"]; Paragraph [Text "bar"]]]]
151+
*)
152+
nl_if_needed_above b;
153+
Buffer.add_string b "```\n";
154+
Buffer.add_string b (escape_mrkdwn c);
155+
nl_if_needed_above b;
156+
Buffer.add_string b "```\n";
157+
loop list_indent tl
158+
| Code (_lang, c) ->
159+
(* sadly, slack mrkdwn has no way to escape backticks within in-line code,
160+
so broken markup is unavoidable
161+
*)
162+
Buffer.add_char b '`';
163+
Buffer.add_string b (escape_mrkdwn c);
164+
Buffer.add_char b '`';
165+
loop list_indent tl
166+
| Hr ->
167+
Buffer.add_string b "* * *\n";
168+
loop list_indent tl
169+
| Html (_tagname, _attrs, _body) as html -> loop list_indent (Code ("", to_html [ html ]) :: tl)
170+
| Html_block (_tagname, _attrs, _body) as html -> loop list_indent (Code_block ("", to_html [ html ]) :: tl)
171+
| Html_comment _s -> loop list_indent tl
172+
| Url (href, s, title) ->
173+
Buffer.add_char b '<';
174+
Buffer.add_string b href;
175+
Buffer.add_char b '|';
176+
if String.length title > 0 then Printf.bprintf b "%s - " @@ escape_mrkdwn title;
177+
loop list_indent s;
178+
Buffer.add_char b '>';
179+
loop list_indent tl
180+
| H1 md' | H2 md' | H3 md' | H4 md' | H5 md' | H6 md' -> loop list_indent (Paragraph [ Bold md' ] :: tl)
181+
| Br ->
182+
(* the string "\\n" (backslash-newline) or end of line double-space renders Br *)
183+
nl b;
184+
loop list_indent tl
185+
| NL ->
186+
(* the string "\n" renders NL *)
187+
nl_if_needed_above b;
188+
loop list_indent tl
189+
in
190+
(* print the document *)
191+
loop 0 md;
192+
(* print any references *)
193+
begin
194+
match !references with
195+
| None -> ()
196+
| Some r ->
197+
let print_ref (name, (url, title)) =
198+
if String.equal title "" then Printf.bprintf b "[%s]: %s \n" name url
199+
else Printf.bprintf b "[%s]: %s \"%s\"\n" name url title
200+
in
201+
nl b;
202+
List.iter ~f:print_ref r#get_all
203+
end;
204+
Buffer.contents b
65205

66-
let mrkdwn_of_markdown str = unescape_omd @@ to_markdown @@ transform_list @@ of_string str
206+
let mrkdwn_of_markdown str = mrkdwn_of_md @@ of_string str

mock_payloads/commit_comment.mrkdwn_comment.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
"created_at": "2020-06-07T15:11:50Z",
3333
"updated_at": "2020-06-07T15:11:50Z",
3434
"author_association": "OWNER",
35-
"body": "**bold**, *italic*\n> blockquote\n\n* list-element 1\n* list-element 2\n * list-element2.1\n * [list-element2.2](www.google.com)\n\n<div>html block</div>\n```ocaml\nStdio.printf \"hello ocaml\"\n```\nescaped chars \\\\\\(\\)\\[\\]\\`\\*\n`keep \\`\n```\nkeep \\\n```"
35+
"body": "# header 1\r\n## header 2\r\n### header 3\r\n#### header 4\r\n##### header 5\r\n###### header 6\r\n\r\n***\r\n\r\nparagraph 1 **bold**, _italic_, ~strikethrough~, **_bold italic_**, _**italic bold**_\r\n\r\nparagraph 2\r\nhard line break\r\n\r\n> blockquote\r\n\r\n> > > multi\r\n> >\r\n> > level\r\n>\r\n> blockquote\r\n\r\n* list-element 1\r\n* list-element 2\r\n * list-element 2.1\r\n * list-element 2.1.1\r\n * [list-element 2.2](www.google.com)\r\n\r\n1. list-element 1\r\n1. list-element 2\r\n 1. list-element 2.1\r\n 1. list-element 2.1.1\r\n 1. [list-element 2.2](www.google.com)\r\n1. list-element 3\r\n\r\n[some link text](www.google.com \"link with title\")\r\n\r\n* > quote in list\r\n\r\n> * list in quote\r\n\r\n![image](https://via.placeholder.com/20x20)\r\n\r\n![image](https://via.placeholder.com/20x20 \"with title\")\r\n\r\n![](https://via.placeholder.com/20x20 \"no alt\")\r\n\r\n![](https://via.placeholder.com/20x20)\r\n\r\n<div>html block with <i>styling</i></div>\r\n\r\n```ocaml\r\nStdio.printf \"hello ocaml\"\r\n```\r\n\r\ninline `code block`\r\n\r\nescape these \\\\\\(\\)\\[\\]\\`\\* `but keep \\`\r\n\r\n```\r\nand keep \\\r\n```\r\n"
3636
},
3737
"repository": {
3838
"id": 0,

test/slack_payloads.expected

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ will notify #all-push-events
9595
"pretext":
9696
"<https://github.com/xinyuluo/monorepo|[xinyuluo/monorepo]> *xinyuluo* commented on `<https://github.com/xinyuluo/monorepo/commit/cd5b85afa306840e0790b62e349ee1f828b2a3c2#commitcomment-0|cd5b85af>` add new line at EOF",
9797
"text":
98-
"*bold*, _italic_\n\n> blockquote\n\n\n- list-element 1\n- list-element 2\n - list-element2.1\n - <www.google.com|list-element2.2>\n\n\n```\n<div>html block</div>\n```\n```\nStdio.printf \"hello ocaml\"\n```\nescaped chars \\()[]`*\n`keep \\`\n\n```\nkeep \\\n```",
98+
"*header 1*\n\n*header 2*\n\n*header 3*\n\n*header 4*\n\n*header 5*\n\n*header 6*\n\n* * *\nparagraph 1 *bold*, _italic_, ~strikethrough~, *_bold italic_*, _*italic bold*_\n\nparagraph 2\nhard line break\n\n> blockquote\n\n> > > multi\n> > \n> > level\n> \n> blockquote\n\n- list-element 1\n- list-element 2\n - list-element 2.1\n - list-element 2.1.1\n - <www.google.com|list-element 2.2>\n\n1. list-element 1\n2. list-element 2\n 1. list-element 2.1\n 1. list-element 2.1.1\n 2. <www.google.com|list-element 2.2>\n3. list-element 3\n\n<www.google.com|link with title - some link text>\n\n- > quote in list\n\n> - list in quote\n\n<https://via.placeholder.com/20x20|image>\n\n<https://via.placeholder.com/20x20|with title - image>\n\n<https://via.placeholder.com/20x20|no alt - >\n\n<https://via.placeholder.com/20x20|>\n\n```\n&lt;div&gt;html block with &lt;i&gt;styling&lt;/i&gt;&lt;/div&gt;\n```\n```\nStdio.printf \"hello ocaml\"\n```\ninline `code block`\n\nescape these \\()[]`* `but keep \\`\n\n```\nand keep \\\n```",
9999
"footer":
100100
"New comment by xinyuluo in <https://github.com/xinyuluo/monorepo/commit/cd5b85afa306840e0790b62e349ee1f828b2a3c2#commitcomment-0|main.ml>"
101101
}
@@ -196,7 +196,7 @@ will notify #frontend-bot
196196
"pretext":
197197
"<https://github.com/ahrefs/monorepo|[ahrefs/monorepo]> *yasunariw* <https://github.com/ahrefs/monorepo/pull/6168#issuecomment-17123|commented> on #6168 <https://github.com/ahrefs/monorepo/pull/6168|Feature 1234>",
198198
"text":
199-
"> I do not understand this\n\n\nDo you have access to the logs on the website?"
199+
"> I do not understand this\n\nDo you have access to the logs on the website?"
200200
}
201201
]
202202
}

0 commit comments

Comments
 (0)