Skip to content

Commit 078294b

Browse files
committed
add markdown preprocessing to handle code blocks and code spans correctly
1 parent 5e5b87c commit 078294b

File tree

4 files changed

+595
-27
lines changed

4 files changed

+595
-27
lines changed

clang-tools-extra/clangd/SymbolDocumentation.cpp

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "clang/AST/CommentCommandTraits.h"
1414
#include "clang/AST/CommentVisitor.h"
1515
#include "llvm/ADT/DenseMap.h"
16+
#include "llvm/ADT/StringExtras.h"
1617
#include "llvm/ADT/StringRef.h"
1718

1819
namespace clang {
@@ -249,7 +250,53 @@ class BlockCommentToMarkupDocument
249250
}
250251
}
251252

253+
/// Emits the content of a doxygen \\code verbatim block as a code block.
///
/// The \\code command has an optional language argument (e.g. `{.cpp}`)
/// which is currently not parsed by the clang doxygen parser and therefore
/// shows up as the first line of the verbatim block. If the first line is a
/// single word enclosed in `{}`, it is taken as the language and is not
/// added to the code text.
///
/// \param VB The verbatim block comment of the \\code command.
void visitCodeCommand(const comments::VerbatimBlockComment *VB) {
  std::string CodeLang;
  auto *FirstLine = VB->child_begin();

  if (VB->getNumLines() > 0) {
    // dyn_cast<> (not cast<>) because the result is used as a condition:
    // cast<> asserts on a type mismatch instead of returning null.
    if (const auto *Line =
            dyn_cast<comments::VerbatimBlockLineComment>(*FirstLine)) {
      llvm::StringRef Text = Line->getText();
      // Language is a single word enclosed in {}.
      if (llvm::none_of(Text, llvm::isSpace) && Text.consume_front("{") &&
          Text.consume_back("}")) {
        // Drop a potential '.' since this is not supported in Markdown
        // fenced code blocks.
        Text.consume_front(".");
        // Language is alphanumeric or '+'.
        CodeLang = Text.take_while([](char C) {
                         return llvm::isAlnum(C) || C == '+';
                       })
                       .str();
        // Skip the first line for the verbatim text.
        ++FirstLine;
      }
    }
  }

  // Concatenate the remaining lines, each terminated by a newline.
  std::string CodeBlockText;
  for (const auto *LI = FirstLine; LI != VB->child_end(); ++LI) {
    if (const auto *Line = dyn_cast<comments::VerbatimBlockLineComment>(*LI)) {
      const llvm::StringRef Text = Line->getText();
      // Append in place instead of building a temporary string per line.
      CodeBlockText.append(Text.data(), Text.size());
      CodeBlockText.push_back('\n');
    }
  }

  Out.addCodeBlock(CodeBlockText, CodeLang);
}
291+
252292
void visitVerbatimBlockComment(const comments::VerbatimBlockComment *VB) {
293+
// The \\code command is a special verbatim block command which we handle
294+
// separately.
295+
if (VB->getCommandID() == comments::CommandTraits::KCI_code) {
296+
visitCodeCommand(VB);
297+
return;
298+
}
299+
253300
commandToMarkup(Out.addParagraph(), VB->getCommandName(Traits),
254301
VB->getCommandMarker(), "");
255302

@@ -292,6 +339,110 @@ class BlockCommentToMarkupDocument
292339
}
293340
};
294341

342+
void SymbolDocCommentVisitor::preprocessDocumentation(StringRef Doc) {
  // The raw documentation string carries no comment markers (e.g. /// or
  // /** */), but the comment lexer expects them. Every line is therefore
  // written to CommentWithMarkers with a leading "///". The /// style is used
  // because the text itself could contain "*/", which would terminate a
  // C style "/** */" comment early.
  llvm::raw_string_ostream Stream(CommentWithMarkers);

  // True while the lines being read belong to a markdown fenced code block.
  bool InsideFence = false;
  // The fence characters which opened the current fenced code block.
  std::string OpeningFence;
  // True while inside a markdown inline code span. Doxygen commands in such a
  // span are escaped so that the doxygen parser does not interpret them.
  // Note: This is a heuristic which may fail in some cases.
  bool InsideCodeSpan = false;

  // Translates one line of the documentation and writes it to the stream.
  auto ProcessLine = [&](llvm::StringRef Text) {
    if (InsideFence) {
      if (Text.ltrim().starts_with(OpeningFence)) {
        // Closing fence: terminate the doxygen code block.
        Stream << "///@endcode\n";
        InsideFence = false;
      } else {
        // Content of the code block is passed through untouched.
        Stream << "///" << Text << "\n";
      }
      return;
    }

    // Opening code fence (``` or ~~~), see
    // https://www.doxygen.nl/manual/markdown.html#md_fenced
    const llvm::StringRef Unindented = Text.ltrim();
    if (Unindented.starts_with("```") || Unindented.starts_with("~~~")) {
      OpeningFence =
          Unindented.take_while([](char Ch) { return Ch == '`' || Ch == '~'; })
              .str();

      // The language is the first word after the fence, optionally enclosed
      // in {}. A leading '{' is dropped here because it is written again
      // below for the doxygen command.
      llvm::StringRef Info = Unindented.drop_front(OpeningFence.size()).ltrim();
      Info.consume_front("{");
      // The name is alphanumeric or '.' or '+'.
      const llvm::StringRef Language = Info.take_while([](char Ch) {
        return llvm::isAlnum(Ch) || Ch == '.' || Ch == '+';
      });

      Stream << "///@code";
      if (!Language.empty())
        Stream << "{" << Language.str() << "}";
      Stream << "\n";

      InsideFence = true;
      return;
    }

    // FIXME: handle indented code blocks too?
    // In doxygen, the indentation which triggers a code block depends on the
    // indentation of the previous paragraph.
    // https://www.doxygen.nl/manual/markdown.html#mddox_code_blocks

    // Ordinary line: add the doxygen marker and handle escaping.
    Stream << "///";

    if (Text.trim().empty()) {
      Stream << "\n";
      // Blank lines reset the code span state.
      InsideCodeSpan = false;
      return;
    }

    // A comment line starting with '///<' is treated as a doxygen command.
    // A space in front of the '<' avoids this.
    if (Text.starts_with("<"))
      Stream << ' ';

    for (const char Ch : Text) {
      if (Ch == '`')
        InsideCodeSpan = !InsideCodeSpan;
      else if (InsideCodeSpan && (Ch == '@' || Ch == '\\'))
        Stream << '\\';
      Stream << Ch;
    }

    Stream << "\n";
  };

  // The documentation string is processed line by line. Iteration stops as
  // soon as both the current line and the remaining text are empty.
  llvm::StringRef Current, Remaining;
  for (std::tie(Current, Remaining) = Doc.split('\n');
       !Current.empty() || !Remaining.empty();
       std::tie(Current, Remaining) = Remaining.split('\n'))
    ProcessLine(Current);

  // Close a code block which was still open at the end of the documentation.
  if (InsideFence)
    Stream << "///@endcode\n";
}
445+
295446
void SymbolDocCommentVisitor::visitBlockCommandComment(
296447
const comments::BlockCommandComment *B) {
297448
switch (B->getCommandID()) {

clang-tools-extra/clangd/SymbolDocumentation.h

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "clang/AST/CommentSema.h"
2222
#include "clang/AST/CommentVisitor.h"
2323
#include "clang/Basic/SourceManager.h"
24+
#include "llvm/ADT/StringRef.h"
2425
#include "llvm/Support/raw_ostream.h"
2526
#include <string>
2627

@@ -51,31 +52,8 @@ class SymbolDocCommentVisitor
5152
CommentWithMarkers.reserve(Documentation.size() +
5253
Documentation.count('\n') * 3);
5354

54-
// The comment lexer expects doxygen markers, so add them back.
55-
// We need to use the /// style doxygen markers because the comment could
56-
// contain the closing the closing tag "*/" of a C Style "/** */" comment
57-
// which would break the parsing if we would just enclose the comment text
58-
// with "/** */".
59-
CommentWithMarkers = "///";
60-
bool NewLine = true;
61-
for (char C : Documentation) {
62-
if (C == '\n') {
63-
CommentWithMarkers += "\n///";
64-
NewLine = true;
65-
} else {
66-
if (NewLine && (C == '<')) {
67-
// A comment line starting with '///<' is treated as a doxygen
68-
// comment. Therefore add a space to separate the '<' from the comment
69-
// marker. This allows to parse html tags at the beginning of a line
70-
// and the escape marker prevents adding the artificial space in the
71-
// markup documentation. The extra space will not be rendered, since
72-
// we render it as markdown.
73-
CommentWithMarkers += ' ';
74-
}
75-
CommentWithMarkers += C;
76-
NewLine = false;
77-
}
78-
}
55+
preprocessDocumentation(Documentation);
56+
7957
SourceManagerForFile SourceMgrForFile("mock_file.cpp", CommentWithMarkers);
8058

8159
SourceManager &SourceMgr = SourceMgrForFile.get();
@@ -149,6 +127,27 @@ class SymbolDocCommentVisitor
149127
TemplateParameters[TP->getParamNameAsWritten()] = std::move(TP);
150128
}
151129

130+
/// \brief Preprocesses the raw documentation string to prepare it for doxygen
131+
/// parsing.
132+
///
133+
/// This is a workaround to provide better support for markdown in
134+
/// doxygen. Clang's doxygen parser e.g. does not handle markdown code blocks.
135+
///
136+
/// The documentation string is preprocessed to replace some markdown
137+
/// constructs with parsable doxygen commands. E.g. markdown code blocks are
138+
/// replaced with doxygen \\code{.lang} ...
139+
/// \\endcode blocks.
140+
///
141+
/// Additionally, potential doxygen commands inside markdown
142+
/// inline code spans are escaped to avoid that doxygen tries to interpret
143+
/// them as commands.
144+
///
145+
/// \note Although this is a workaround, it is very similar to what
146+
/// doxygen itself does for markdown. In doxygen, the first parsing step is
147+
/// also a markdown preprocessing step.
148+
/// See https://www.doxygen.nl/manual/markdown.html
149+
void preprocessDocumentation(StringRef Doc);
150+
152151
private:
153152
comments::CommandTraits Traits;
154153
llvm::BumpPtrAllocator Allocator;

clang-tools-extra/clangd/support/Markup.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,16 @@ bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
199199
return needsLeadingEscapeMarkdown(C, After);
200200
}
201201

202-
/// Escape a markdown text block.
202+
/// \brief Render text for markdown output.
203+
///
203204
/// If \p EscapeMarkdown is true it ensures the punctuation will not introduce
204205
/// any of the markdown constructs.
206+
///
205207
/// Else, markdown syntax is not escaped, only HTML tags and entities.
208+
/// HTML is escaped because usually clients do not support HTML rendering by
209+
/// default. Passing unescaped HTML will therefore often result in not showing
210+
/// the HTML at all.
211+
/// \note In markdown code spans, we do not escape anything.
206212
std::string renderText(llvm::StringRef Input, bool StartsLine,
207213
bool EscapeMarkdown) {
208214
std::string R;
@@ -213,6 +219,10 @@ std::string renderText(llvm::StringRef Input, bool StartsLine,
213219

214220
bool IsFirstLine = true;
215221

222+
// Inside markdown code spans, we do not escape anything when EscapeMarkdown
223+
// is false.
224+
bool InCodeSpan = false;
225+
216226
for (std::tie(Line, Rest) = Input.split('\n');
217227
!(Line.empty() && Rest.empty());
218228
std::tie(Line, Rest) = Rest.split('\n')) {
@@ -226,12 +236,27 @@ std::string renderText(llvm::StringRef Input, bool StartsLine,
226236
R.append(LeadingSpaces);
227237
}
228238

239+
// Handle the case where the user escaped a character themselves.
240+
// This is relevant for markdown code spans if EscapeMarkdown is false,
241+
// because if the user escaped a backtick, we must treat the enclosed text
242+
// as normal markdown text.
243+
bool UserEscape = false;
229244
for (unsigned I = LeadingSpaces.size(); I < Line.size(); ++I) {
230-
if (needsLeadingEscape(Line[I], Line.substr(LeadingSpaces.size(), I),
245+
246+
if (!EscapeMarkdown && !UserEscape && Line[I] == '`')
247+
InCodeSpan = !InCodeSpan;
248+
249+
if (!InCodeSpan &&
250+
needsLeadingEscape(Line[I], Line.substr(LeadingSpaces.size(), I),
231251
Line.substr(I + 1), StartsLineIntern,
232252
EscapeMarkdown))
233253
R.push_back('\\');
234254
R.push_back(Line[I]);
255+
256+
if (Line[I] == '\\')
257+
UserEscape = !UserEscape;
258+
else
259+
UserEscape = false;
235260
}
236261

237262
IsFirstLine = false;

0 commit comments

Comments
 (0)