55
66package software.amazon.smithy.kotlin.codegen.lang
77
8+ import org.jsoup.Jsoup
9+ import org.jsoup.nodes.Document
10+ import org.jsoup.nodes.Node
11+ import org.jsoup.nodes.TextNode
12+ import org.jsoup.select.NodeVisitor
13+ import software.amazon.smithy.codegen.core.CodegenException
814import software.amazon.smithy.kotlin.codegen.KotlinSettings
915import software.amazon.smithy.kotlin.codegen.integration.KotlinIntegration
1016import software.amazon.smithy.model.Model
1117import software.amazon.smithy.model.traits.DocumentationTrait
1218import software.amazon.smithy.model.transform.ModelTransformer
1319
1420/* *
15- * Sanitize all instances of [DocumentationTrait]
 * Sanitizes all instances of [DocumentationTrait] and converts them to KDoc-compliant strings.
1622 */
1723class DocumentationPreprocessor : KotlinIntegration {
1824
@@ -21,17 +27,226 @@ class DocumentationPreprocessor : KotlinIntegration {
2127 return transformer.mapTraits(model) { _, trait ->
2228 when (trait) {
2329 is DocumentationTrait -> {
24- val docs = sanitize(trait.value)
30+ // There's definitely some improperly escaped HTML characters within preformat blocks in existing
31+ // models. Ensure we strip those now, the parser is VERY forgiving and will mistreat any sequences
32+ // of characters that happen to form tags as such.
33+ val sanitizedDoc = trait.value
34+ .applyWithin(" <code>" , " </code>" , String ::escapeHtml)
35+ .applyWithin(" <pre>" , " </pre>" , String ::escapeHtml)
36+ val docs = toKdoc(sanitizedDoc)
2537 DocumentationTrait (docs, trait.sourceLocation)
2638 }
2739 else -> trait
2840 }
2941 }
3042 }
3143
32- // KDoc comments use inline markdown. Replace square brackets with escaped equivalents so that they
33- // are not rendered as invalid links
34- private fun sanitize (str : String ): String =
35- str.replace(" [" , " [" )
36- .replace(" ]" , " ]" )
/**
 * Renders a documentation string as Markdown suitable for KDoc.
 *
 * The input is parsed via [parseClean], the body of the resulting document is traversed with a fresh
 * [MarkdownRenderer], and the renderer's accumulated text is returned.
 */
private fun toKdoc(doc: String): String {
    val body = parseClean(doc).body()
    return MarkdownRenderer()
        .also { visitor -> body.traverse(visitor) }
        .text()
}
51+
/**
 * Parses [rawDoc] with Jsoup and removes blank text nodes from the document body before handing the
 * document back to the caller.
 */
private fun parseClean(rawDoc: String): Document =
    Jsoup.parse(rawDoc).also { document -> document.body().stripBlankTextNodes() }
59+
/**
 * A Jsoup [NodeVisitor] that renders the nodes it visits as Markdown.
 *
 * Output accumulates while a document is traversed; call [text] once traversal has finished to obtain the
 * trimmed result. An instance carries rendering state, so use a fresh one per document.
 */
private class MarkdownRenderer : NodeVisitor {
    private val builder = StringBuilder()

    // Anchors are buffered until their closing tag so the link can be written as a single `[text](href)` unit.
    private var bufferedAnchorHref: String = ""
    private var bufferedAnchorText: String = ""

    // Indentation written in front of every list item; grows by SUBLIST_INDENT for each level of nesting.
    private var listPrefix: String = ""

    /** Returns everything rendered so far, with leading and trailing whitespace trimmed. */
    fun text() = builder.toString().trim()

    /** Called when a node is first encountered: writes text content and opening Markdown markers. */
    override fun head(node: Node, depth: Int) {
        if (node is TextNode) {
            if (node.parentNode()?.nodeName() == "a") {
                // Accumulate instead of overwriting: an anchor may hold several direct text nodes (for example
                // around an inline element) and none of them should be dropped.
                bufferedAnchorText += node.markdownText()
            } else {
                builder.append(node.markdownText())
            }
            return
        }

        when (node.nodeName()) {
            "a" -> {
                if (node.hasAttr("href")) {
                    bufferedAnchorHref = node.attr("href")
                }
            }
            "li" -> {
                // If this list item holds a sublist, then we essentially just want to line break right away and
                // render the nested list as normal. The size check guards against an empty <li>, for which
                // childNode(0) would throw.
                val holdsSublist = node.childNodeSize() > 0 && node.childNode(0).isList()
                val prefix = if (holdsSublist) "\n" else ""
                builder.append("$listPrefix+ $prefix")
            }
            "ul", "ol" -> {
                if (node.hasAncestor(Node::isList)) {
                    sublistIndent()
                }
            }
            "code", "pre" -> builder.append(PREFORMAT_MARKER)
            "b", "strong" -> builder.append(BOLD_MARKER)
            "i", "em" -> builder.append(ITALIC_MARKER)
            "br" -> builder.ensureLineBreak()

            // Definition lists (dl, dt, dd) have a corresponding md syntax, but neither intellij nor dokka will
            // render them. Treat definition terms as a "header" and let the descriptions flow out like
            // normal markdown content.
            "dt" -> builder.append("## ")

            // Anecdotally this appears to be used to render a "title" display - it always appears at the start
            // of documents.
            "fullname" -> builder.append("# ")

            "body", "p", "note", "important", "dd", "dl", "div" -> {
                // Known elements that we can ignore here - they have no bearing on the output.
            }

            // Occasionally there will be unescaped angle brackets within elements. Those tags will sometimes
            // join to form "elements" that will trick the rather forgiving Jsoup parser, eg.
            //     "<p>specify the URI in the form 's3://<bucket_name>/</p>"
            // The safest approach to malformed input like this is just to write out the content as-is, such
            // that no information is destroyed. The worst outcome is that some seemingly nonsense HTML tag is
            // injected into the output, which a reader can reasonably ignore.
            else -> builder.append("<${node.nodeName()}>")
        }
    }

    /** Called once all of a node's children have been visited: writes closing markers and line breaks. */
    override fun tail(node: Node, depth: Int) {
        when (node.nodeName()) {
            "p", "div", "dd" -> {
                val nextSibling = node.nextSibling()
                when {
                    // break to give the upcoming list a new line
                    nextSibling != null && nextSibling.isList() -> builder.ensureLineBreak()
                    // if we're inside a list, the outer list item will close out the line for us
                    node.hasAncestor(Node::isList) -> Unit
                    // all other cases: this is a standalone "text block" which should be displayed as its own
                    // paragraph
                    else -> builder.ensureSectionBreak()
                }
            }
            "a" -> writeBufferedAnchor()
            "ul", "ol" -> {
                // Dedenting an empty prefix is a no-op, so this is safe for top-level lists too.
                sublistDedent()
                if (node.parent()?.nodeName() == "body") {
                    builder.ensureSectionBreak()
                }
            }
            "code", "pre" -> builder.append(PREFORMAT_MARKER)
            "b", "strong" -> builder.append(BOLD_MARKER)
            "i", "em" -> builder.append(ITALIC_MARKER)
            "li", "fullname", "dt" -> builder.ensureLineBreak()
        }
    }

    /** Writes the buffered anchor to the output and clears the buffers for the next anchor. */
    private fun writeBufferedAnchor() {
        // Model docs will sometimes contain an anchor without the href. At that point there's no real way of
        // knowing to what it refers, nor can we guarantee a valid link just by bracketing it.
        builder.append(
            if (bufferedAnchorHref.isNotEmpty()) {
                "[$bufferedAnchorText]($bufferedAnchorHref)"
            } else {
                bufferedAnchorText
            }
        )

        bufferedAnchorHref = ""
        bufferedAnchorText = ""
    }

    private fun sublistIndent() {
        listPrefix += SUBLIST_INDENT
    }

    private fun sublistDedent() {
        listPrefix = listPrefix.dropLast(SUBLIST_INDENT.length)
    }

    companion object {
        // Two spaces line a nested item up with the content column of its "+ " parent, which is what Markdown
        // needs to treat it as a sublist. NOTE(review): indent width assumed to be two spaces - confirm.
        const val SUBLIST_INDENT = "  "
        const val PREFORMAT_MARKER = "`"
        const val BOLD_MARKER = "**"
        const val ITALIC_MARKER = "*"
    }
}
182+ }
183+
/**
 * Removes every blank text node found in the subtree rooted at this node.
 *
 * Jsoup preserves the newlines between elements as blank text nodes. They have no bearing on the content of
 * the document and only complicate traversal, so they are dropped up front.
 */
private fun Node.stripBlankTextNodes() {
    if (this is TextNode && isBlank) {
        remove()
    } else {
        for (child in childNodes()) {
            child.stripBlankTextNodes()
        }
    }
}
196+
/**
 * Returns true when [predicate] holds for any ancestor of this node, checking the nearest ancestor first.
 * The node itself is not tested.
 */
private fun Node.hasAncestor(predicate: (Node) -> Boolean): Boolean {
    var ancestor: Node? = parent()
    while (ancestor != null) {
        if (predicate(ancestor)) return true
        ancestor = ancestor.parent()
    }
    return false
}
199+
/** Returns true when this node is an unordered (`ul`) or ordered (`ol`) list element. */
private fun Node.isList() =
    when (nodeName()) {
        // Compared against the bare tag names: a padded literal such as " ul" can never equal a node name.
        "ul", "ol" -> true
        else -> false
    }
202+
/**
 * Returns this node's text with square brackets replaced by their HTML numeric character references, so that
 * they are not rendered as invalid Markdown links.
 */
private fun TextNode.markdownText() =
    text()
        // The replacement has to differ from the search string; replacing a bracket with itself is a no-op.
        .replace("[", "&#91;")
        .replace("]", "&#93;")
209+
/**
 * Operates on all substrings that fall within the provided section delimiters. Returns a new string where all
 * substrings enclosed as specified have been modified according to the provided transform.
 *
 * Text outside of the delimited sections, and the delimiters themselves, are copied through unchanged. A section
 * whose [end] delimiter is missing is left untouched, along with everything after it.
 *
 * This extension is not intended to handle nested sections, and will throw if it encounters any.
 *
 * @param start the delimiter that opens a section
 * @param end the delimiter that closes a section
 * @param transform applied to the text between each [start]/[end] pair, in order of appearance
 * @return the receiver with every delimited section transformed
 * @throws CodegenException if a section contains another [start] delimiter
 */
private fun String.applyWithin(start: String, end: String, transform: (String) -> String): String {
    val result = StringBuilder(length)
    // Index of the first character that has not been copied into the result yet.
    var cursor = 0

    // A single forward pass keeps the work linear in the input length, instead of re-copying the remainder of
    // the string for every section found.
    while (true) {
        val startIndex = indexOf(start, cursor)
        if (startIndex == -1) break

        val sectionStart = startIndex + start.length
        val sectionEnd = indexOf(end, sectionStart)
        // Unterminated section: stop here and copy the remainder through as-is.
        if (sectionEnd == -1) break

        val section = substring(sectionStart, sectionEnd)
        if (start in section) {
            throw CodegenException("string contains nested start delimiter")
        }

        result.append(this, cursor, sectionStart)
            .append(transform(section))
            .append(end)
        cursor = sectionEnd + end.length
    }

    return result.append(this, cursor, length).toString()
}
232+
/**
 * Returns this string with the HTML-significant characters `&`, `<` and `>` replaced by their entity
 * equivalents.
 */
private fun String.escapeHtml() =
    // The ampersand has to go first, otherwise the ampersands introduced by the other two entities would be
    // escaped a second time.
    replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
237+
/** Appends a line break unless the content already ends with one. */
private fun StringBuilder.ensureLineBreak() {
    // Check for exactly what appendLine() writes, so that repeated calls never stack up extra breaks.
    if (endsWith("\n")) return
    appendLine()
}
243+
/**
 * Ensures the content ends with a blank line (two consecutive line breaks), appending only as many line
 * breaks as are missing.
 */
private fun StringBuilder.ensureSectionBreak() {
    when {
        // Already separated from whatever comes next.
        endsWith("\n\n") -> Unit
        // One break is present; a single additional one completes the blank line.
        endsWith("\n") -> appendLine()
        else -> append("\n\n")
    }
}
0 commit comments