Skip to content

Commit 26aa156

Browse files
authored
feat: convert html to markdown for improved dokka readability (#634)
1 parent 0f9311a commit 26aa156

File tree

8 files changed

+734
-59
lines changed

8 files changed

+734
-59
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"id": "74cc4273-d134-4b61-9f78-2ed740f3bd9a",
3+
"type": "bugfix",
4+
"description": "Convert HTML to Markdown for improved Dokka compatibility.",
5+
"issues": [
6+
"awslabs/smithy-kotlin#136"
7+
]
8+
}

gradle.properties

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ coroutinesVersion=1.6.0
1616
ktorVersion=1.6.7
1717
atomicFuVersion=0.17.0
1818
kotlinxSerializationVersion=1.3.0
19+
jsoupVersion=1.14.1
1920

2021
# codegen
2122
smithyVersion=1.17.0
@@ -39,4 +40,4 @@ kotlinLoggingVersion=2.0.3
3940
slf4jVersion=1.7.30
4041

4142
# crt
42-
crtKotlinVersion=0.5.4
43+
crtKotlinVersion=0.5.4

smithy-kotlin-codegen/build.gradle.kts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ val smithyVersion: String by project
2020
val kotlinVersion: String by project
2121
val junitVersion: String by project
2222
val kotestVersion: String by project
23+
val jsoupVersion: String by project
2324

2425
dependencies {
2526
implementation(kotlin("stdlib-jdk8"))
2627
api("software.amazon.smithy:smithy-codegen-core:$smithyVersion")
2728
api("software.amazon.smithy:smithy-waiters:$smithyVersion")
2829
implementation("software.amazon.smithy:smithy-aws-traits:$smithyVersion")
2930
implementation("software.amazon.smithy:smithy-protocol-test-traits:$smithyVersion")
31+
implementation("org.jsoup:jsoup:$jsoupVersion")
3032

3133
// Test dependencies
3234
// These are not set as test dependencies so they can be shared with other modules

smithy-kotlin-codegen/src/main/kotlin/software/amazon/smithy/kotlin/codegen/core/KotlinWriter.kt

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,22 @@ class KotlinWriter(
184184
fun dokka(docs: String): KotlinWriter =
185185
dokka {
186186
write(
187-
formatDocumentation(
188-
sanitizeDocumentation(docs)
187+
cleanForWriter(
188+
formatDocumentation(docs)
189189
)
190190
)
191191
}
192192

193+
/**
 * Escapes parts of a documentation string that would otherwise invalidate the generated Kotlin output.
 *
 * @param doc the documentation text to escape
 * @return [doc] with `#` doubled and block-comment delimiters replaced by an entity-style form
 */
private fun cleanForWriter(doc: String): String {
    // Docs can contain literal '#' characters that must not be run through the writer's formatters.
    val hashesEscaped = doc.replace("#", "##")

    // Service docs sometimes embed comment delimiters; rewrite the slash on both the opening and closing
    // forms. The replacement text itself carries a doubled '#', consistent with the escaping above.
    val openersEscaped = hashesEscaped.replace("/*", "&##47;*")
    return openersEscaped.replace("*/", "*&##47;")
}
202+
193203
/**
194204
* Adds appropriate annotations to generated declarations.
195205
*/
@@ -341,36 +351,6 @@ class InlineKotlinWriterFormatter(private val parent: KotlinWriter) : BiFunction
341351
}
342352
}
343353

344-
// Most commonly occurring (but not exhaustive) set of HTML tags found in AWS models
345-
private val commonHtmlTags = setOf(
346-
"a",
347-
"b",
348-
"code",
349-
"dd",
350-
"dl",
351-
"dt",
352-
"i",
353-
"important",
354-
"li",
355-
"note",
356-
"p",
357-
"strong",
358-
"ul"
359-
).map { listOf("<$it>", "</$it>") }.flatten()
360-
361-
// Replace characters in the input documentation to prevent issues in codegen or rendering.
362-
// NOTE: Currently we look for specific strings of Html tags commonly found in docs
363-
// and remove them. A better solution would be to generally convert from HTML to "pure"
364-
// markdown such that formatting is preserved.
365-
// TODO: https://github.com/awslabs/smithy-kotlin/issues/136
366-
private fun sanitizeDocumentation(doc: String): String = doc
367-
.stripAll(commonHtmlTags)
368-
// Docs can have valid $ characters that shouldn't run through formatters.
369-
.replace("#", "##")
370-
// Services may have comment string literals embedded in documentation.
371-
.replace("/*", "&##47;*")
372-
.replace("*/", "*&##47;")
373-
374354
// Remove all strings from source string and return the result
375355
private fun String.stripAll(stripList: List<String>): String {
376356
var newStr = this
@@ -379,15 +359,14 @@ private fun String.stripAll(stripList: List<String>): String {
379359
return newStr
380360
}
381361

382-
// Remove whitespace from the beginning and end of each line of documentation
383362
// Remove leading, trailing, and consecutive blank lines
384363
private fun formatDocumentation(doc: String, lineSeparator: String = "\n") =
385364
doc
386365
.split('\n') // Break the doc into lines
387366
.dropWhile { it.isBlank() } // Drop leading blank lines
388367
.dropLastWhile { it.isBlank() } // Drop trailing blank lines
389368
.dropConsecutive { it.isBlank() } // Remove consecutive empty lines
390-
.joinToString(separator = lineSeparator) { it.trim() } // Trim line
369+
.joinToString(separator = lineSeparator)
391370

392371
/**
393372
* Filters out consecutive items matching the given [predicate].

smithy-kotlin-codegen/src/main/kotlin/software/amazon/smithy/kotlin/codegen/lang/DocumentationPreprocessor.kt

Lines changed: 222 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,20 @@
55

66
package software.amazon.smithy.kotlin.codegen.lang
77

8+
import org.jsoup.Jsoup
9+
import org.jsoup.nodes.Document
10+
import org.jsoup.nodes.Node
11+
import org.jsoup.nodes.TextNode
12+
import org.jsoup.select.NodeVisitor
13+
import software.amazon.smithy.codegen.core.CodegenException
814
import software.amazon.smithy.kotlin.codegen.KotlinSettings
915
import software.amazon.smithy.kotlin.codegen.integration.KotlinIntegration
1016
import software.amazon.smithy.model.Model
1117
import software.amazon.smithy.model.traits.DocumentationTrait
1218
import software.amazon.smithy.model.transform.ModelTransformer
1319

1420
/**
15-
* Sanitize all instances of [DocumentationTrait]
21+
* Sanitizes all instances of [DocumentationTrait] and converts them to KDoc-compliant strings.
1622
*/
1723
class DocumentationPreprocessor : KotlinIntegration {
1824

@@ -21,17 +27,226 @@ class DocumentationPreprocessor : KotlinIntegration {
2127
return transformer.mapTraits(model) { _, trait ->
2228
when (trait) {
2329
is DocumentationTrait -> {
24-
val docs = sanitize(trait.value)
30+
// There are definitely some improperly escaped HTML characters within preformat blocks in existing
31+
// models. Ensure we escape those now; the parser is VERY forgiving and will mistreat any sequences
32+
// of characters that happen to form tags as such.
33+
val sanitizedDoc = trait.value
34+
.applyWithin("<code>", "</code>", String::escapeHtml)
35+
.applyWithin("<pre>", "</pre>", String::escapeHtml)
36+
val docs = toKdoc(sanitizedDoc)
2537
DocumentationTrait(docs, trait.sourceLocation)
2638
}
2739
else -> trait
2840
}
2941
}
3042
}
3143

32-
// KDoc comments use inline markdown. Replace square brackets with escaped equivalents so that they
33-
// are not rendered as invalid links
34-
private fun sanitize(str: String): String =
35-
str.replace("[", "&#91;")
36-
.replace("]", "&#93;")
44+
/**
 * Renders a documentation string as Markdown: the input is parsed via [parseClean] and the resulting
 * document body is walked with a fresh [MarkdownRenderer].
 */
private fun toKdoc(doc: String): String =
    MarkdownRenderer()
        .also { visitor -> parseClean(doc).body().traverse(visitor) }
        .text()
51+
52+
/**
 * Parses [rawDoc] with Jsoup and removes blank text nodes from the document body before returning it.
 */
private fun parseClean(rawDoc: String): Document =
    Jsoup.parse(rawDoc).apply { body().stripBlankTextNodes() }
59+
60+
/**
 * A [NodeVisitor] that accumulates a Markdown rendering of the nodes it visits.
 *
 * [head] is invoked when a node is entered and emits opening markers and text; [tail] is invoked when a
 * node is left and emits closing markers and line/section breaks. Call [text] after traversal to obtain
 * the rendered result.
 */
private class MarkdownRenderer : NodeVisitor {
    private val builder = StringBuilder()

    // Anchor state is buffered between head("a") and tail("a") so the link can be written in one piece.
    private var bufferedAnchorHref: String = ""
    private var bufferedAnchorText: String = ""

    // Indentation written in front of each list item; grows/shrinks as nested lists are entered/left.
    private var listPrefix: String = ""

    /** Returns everything rendered so far with leading and trailing whitespace trimmed. */
    fun text() = builder.toString().trim()

    /**
     * Handles entry into [node]: text is appended (or buffered when its parent is an anchor) and known
     * elements emit their opening Markdown marker.
     */
    override fun head(node: Node, depth: Int) {
        if (node is TextNode) {
            if (node.parentNode()?.nodeName() == "a") {
                // Accumulate rather than assign so that an anchor with more than one direct text child
                // does not lose all but the last piece of text.
                bufferedAnchorText += node.markdownText()
            } else {
                builder.append(node.markdownText())
            }
            return
        }

        when (node.nodeName()) {
            "a" -> {
                if (node.hasAttr("href")) {
                    bufferedAnchorHref = node.attr("href")
                }
            }
            "li" -> {
                // If this list item holds a sublist, then we essentially just want to line break right away and
                // render the nested list as normal. Guard against an empty item (no child nodes at all) and
                // treat ordered sublists the same way as unordered ones.
                val startsWithSublist = node.childNodeSize() > 0 && node.childNode(0).isList()
                val prefix = if (startsWithSublist) "\n" else ""
                builder.append("$listPrefix+ $prefix")
            }
            "ul", "ol" -> {
                if (node.hasAncestor(Node::isList)) {
                    sublistIndent()
                }
            }
            "code", "pre" -> builder.append(PREFORMAT_MARKER)
            "b", "strong" -> builder.append(BOLD_MARKER)
            "i", "em" -> builder.append(ITALIC_MARKER)
            "br" -> builder.ensureLineBreak()

            // Definition lists (dl, dt, dd) have a corresponding md syntax, but neither intellij nor dokka will
            // render them. Treat definition terms as a "header" and let the descriptions flow out like
            // normal markdown content.
            "dt" -> builder.append("## ")

            // Anecdotally this appears to be used to render a "title" display - it always appears at the start
            // of documents.
            "fullname" -> builder.append("# ")

            "body", "p", "note", "important", "dd", "dl", "div" -> {
                // Known elements that we can ignore here - they have no bearing on the output.
            }

            // Occasionally there will be unescaped angle brackets within elements. Those tags will sometimes
            // join to form "elements" that will trick the rather forgiving Jsoup parser, eg.
            // "<p>specify the URI in the form 's3://<bucket_name>/</p>"
            // The safest approach to malformed input like this is just to write out the content as-is, such
            // that no information is destroyed. The worst outcome is that some seemingly nonsense HTML tag is
            // injected into the output, which a reader can reasonably ignore.
            else -> builder.append("<${node.nodeName()}>")
        }
    }

    /**
     * Handles exit from [node]: emits closing markers, flushes a buffered anchor, and inserts the line or
     * section break appropriate for the element that just ended.
     */
    override fun tail(node: Node, depth: Int) {
        when (node.nodeName()) {
            "p", "div", "dd" -> {
                val nextSibling = node.nextSibling()
                when {
                    // break to give the upcoming list a new line
                    nextSibling != null && nextSibling.isList() -> builder.ensureLineBreak()
                    // if we're inside a list, the outer list item will close out the line for us
                    node.hasAncestor(Node::isList) -> return
                    // all other cases: this is a standalone "text block" which should be displayed as its own
                    // paragraph
                    else -> builder.ensureSectionBreak()
                }
            }
            "a" -> writeBufferedAnchor()
            "ul", "ol" -> {
                sublistDedent()
                if (node.parent()?.nodeName() == "body") {
                    builder.ensureSectionBreak()
                }
            }
            "code", "pre" -> builder.append(PREFORMAT_MARKER)
            "b", "strong" -> builder.append(BOLD_MARKER)
            "i", "em" -> builder.append(ITALIC_MARKER)
            "li", "fullname", "dt" -> builder.ensureLineBreak()
        }
    }

    /** Writes the buffered anchor as a Markdown link (or as plain text when no href was seen) and resets it. */
    private fun writeBufferedAnchor() {
        // Model docs will sometimes contain an anchor without the href. At that point there's no real way of
        // knowing to what it refers, nor can we guarantee a valid link just by bracketing it.
        builder.append(
            if (bufferedAnchorHref != "") {
                "[$bufferedAnchorText]($bufferedAnchorHref)"
            } else {
                bufferedAnchorText
            }
        )

        bufferedAnchorHref = ""
        bufferedAnchorText = ""
    }

    /** Increases the list-item prefix by one level of [SUBLIST_INDENT]. */
    private fun sublistIndent() {
        listPrefix += SUBLIST_INDENT
    }

    /** Decreases the list-item prefix by one level of [SUBLIST_INDENT]; a no-op when already empty. */
    private fun sublistDedent() {
        listPrefix = listPrefix.dropLast(SUBLIST_INDENT.length)
    }

    companion object {
        // NOTE(review): confirm the width of this literal against the checked-in file; whitespace inside
        // the string may not have survived extraction.
        const val SUBLIST_INDENT = " "
        const val PREFORMAT_MARKER = "`"
        const val BOLD_MARKER = "**"
        const val ITALIC_MARKER = "*"
    }
}
182+
}
183+
184+
/**
 * Recursively removes blank [TextNode]s from this node's subtree.
 *
 * Jsoup keeps the newlines between elements as blank text nodes; they carry no content and only make
 * traversal harder, so they are dropped up front.
 */
private fun Node.stripBlankTextNodes() {
    when {
        // A blank text node has no children worth visiting, so removing it ends the descent here.
        this is TextNode && isBlank -> remove()
        else -> childNodes().forEach { child -> child.stripBlankTextNodes() }
    }
}
196+
197+
/**
 * Returns true when any ancestor of this node (parent, grandparent, ...) satisfies [predicate].
 * The node itself is not tested.
 */
private fun Node.hasAncestor(predicate: (Node) -> Boolean): Boolean {
    var ancestor: Node? = parent()
    while (ancestor != null) {
        if (predicate(ancestor)) return true
        ancestor = ancestor.parent()
    }
    return false
}
199+
200+
/** Returns true when this node is a `ul` or `ol` element. */
private fun Node.isList() =
    when (nodeName()) {
        "ul", "ol" -> true
        else -> false
    }
202+
203+
/**
 * Returns this node's text with square brackets replaced by their numeric character references, so that
 * they are not rendered as invalid Markdown links.
 */
private fun TextNode.markdownText(): String {
    val openingEscaped = text().replace("[", "&#91;")
    return openingEscaped.replace("]", "&#93;")
}
209+
210+
/**
 * Applies [transform] to every substring enclosed by the [start] and [end] delimiters and returns the
 * resulting string. The delimiters themselves and all text outside of them are left untouched.
 *
 * Scanning stops at the first [start] that has no matching [end]; the remainder is copied through as-is.
 *
 * Nested sections are not supported.
 *
 * @throws CodegenException if a section contains another occurrence of [start]
 */
private fun String.applyWithin(start: String, end: String, transform: (String) -> String): String {
    val result = StringBuilder()
    var cursor = 0

    while (true) {
        val openIndex = indexOf(start, cursor)
        if (openIndex == -1) break

        val innerStart = openIndex + start.length
        val innerEnd = indexOf(end, innerStart)
        if (innerEnd == -1) break

        val inner = substring(innerStart, innerEnd)
        if (inner.indexOf(start) != -1) {
            throw CodegenException("string contains nested start delimiter")
        }

        // Copy everything up to and including the opening delimiter, then the transformed section and
        // its closing delimiter.
        result.append(this, cursor, innerStart)
            .append(transform(inner))
            .append(end)
        cursor = innerEnd + end.length
    }

    // Nothing was transformed: hand back the receiver itself.
    if (cursor == 0) return this

    return result.append(this, cursor, length).toString()
}
232+
233+
/** Matches an ampersand that does not already begin a named, decimal, or hexadecimal character reference. */
private val BARE_AMPERSAND = Regex("&(?![a-zA-Z][a-zA-Z0-9]*;|#[0-9]+;|#[xX][0-9a-fA-F]+;)")

/**
 * Escapes `&`, `<`, and `>` so the receiver can be embedded as HTML text.
 *
 * Ampersands that already start a character reference (e.g. `&lt;`, `&amp;`, `&#91;`) are left alone, so
 * content that was escaped correctly to begin with is not double-escaped into `&amp;lt;` and rendered
 * literally.
 */
private fun String.escapeHtml() =
    replace(BARE_AMPERSAND, "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
237+
238+
/** Appends a line break unless the builder's content already ends with one. */
private fun StringBuilder.ensureLineBreak() {
    if (endsWith("\n")) return
    appendLine()
}
243+
244+
/**
 * Makes sure the builder's content ends with a blank line (two consecutive line breaks), appending only
 * as many line breaks as are missing.
 */
private fun StringBuilder.ensureSectionBreak() {
    when {
        endsWith("\n\n") -> Unit
        endsWith("\n") -> appendLine()
        else -> append("\n\n")
    }
}

0 commit comments

Comments
 (0)