Skip to content

Commit a43effa

Browse files
committed
handle RAW HTML
Signed-off-by: George Lemon <georgelemon@protonmail.com>
1 parent 68e51a5 commit a43effa

File tree

2 files changed

+72
-14
lines changed

2 files changed

+72
-14
lines changed

src/marvdown/lexer.nim

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,14 @@ proc advance(lex: var MarkdownLexer) =
7171
inc lex.line
7272
lex.col = 0
7373
lex.wsno = 0
74-
elif lex.current in {' ', '\t', '\r'}:
75-
inc lex.wsno
74+
elif lex.current in {' ', '\t'}:
75+
# Only count indentation (wsno) when whitespace is at start of line.
76+
if lex.col == 0:
77+
inc lex.wsno
78+
inc lex.col
79+
elif lex.current == '\r':
80+
# Treat CR similarly to other non-leading whitespace; do not
81+
# increment wsno for it.
7682
inc lex.col
7783
else:
7884
lex.wsno = 0
@@ -138,8 +144,8 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
138144
# Remove local wsno, use lex.wsno
139145
# Skip whitespace and newlines before token
140146
while true:
141-
while lex.current in {' ', '\t', '\r'}:
142-
lex.advance()
147+
# Only normalize line endings and consume newlines; leave spaces/tabs
148+
# so that they can be emitted as text tokens (preserve inline spaces).
143149
if lex.current == '\n':
144150
lex.col = 0
145151
lex.advance()
@@ -432,36 +438,79 @@ proc nextToken*(lex: var MarkdownLexer): MarkdownTokenTuple =
432438
return newTokenTuple(lex, mtkEmphasis, wsno=lex.wsno)
433439
of ' ':
434440
# Line break (two or more spaces at end of line)
441+
# Also accept tabs as whitespace that should be emitted as text tokens.
435442
if lex.peek() == ' ' and (lex.peek(2) == '\n' or lex.peek(2) == '\r'):
436443
lex.advance(); lex.advance();
437444
if lex.current in {'\n', '\r'}:
438445
lex.advance()
439-
return newTokenTuple(lex, mtkLineBreak, wsno=lex.wsno)
446+
return newTokenTuple(lex, mtkLineBreak, wsno=lex.wsno)
440447
else:
441448
var text = " "
442449
lex.advance()
443450
return newTokenTuple(lex, mtkText, text, wsno=lex.wsno)
451+
of '\t':
452+
# Treat tabs as text tokens similar to spaces.
453+
var text = "\t"
454+
lex.advance()
455+
return newTokenTuple(lex, mtkText, text, wsno=lex.wsno)
444456
of '<':
445-
# Raw HTML
457+
# Raw HTML block: consume until matching closing tag (handles nesting)
446458
lex.strbuf.setLen(0)
447459
var tag: string
448-
var stopTag = false
460+
var stopTagName = false
461+
# Parse opening tag and get tag name
462+
let tagStart = lex.pos
449463
while true:
450464
case lex.current
451465
of '>', '\0': break
452466
of ' ':
453-
stopTag = true
467+
stopTagName = true
454468
lex.strbuf.add(lex.current)
455469
of 'a'..'z', 'A'..'Z', '0'..'9', '_', '-':
456470
lex.strbuf.add(lex.current)
457-
if not stopTag: tag.add(lex.current)
471+
if not stopTagName:
472+
tag.add(lex.current)
458473
else:
459474
lex.strbuf.add(lex.current)
460475
lex.advance()
461476
if lex.current == '>':
462477
lex.strbuf.add(lex.current)
463478
lex.advance()
464-
return newTokenTuple(lex, mtkHtml, lex.strbuf, wsno=lex.wsno, attrs=some(@[tag]))
479+
# now consume until outermost closing tag
480+
# TODO test for self-closing tags
481+
var htmlContent = lex.strbuf
482+
var depth = 1
483+
while depth > 0 and lex.current != '\0':
484+
if lex.current == '<':
485+
if lex.peek() == '/':
486+
# Possible closing tag
487+
var closeTag = ""
488+
var tempPos = lex.pos + 2
489+
while tempPos < lex.input.len and lex.input[tempPos] in {'a'..'z', 'A'..'Z', '0'..'9', '_', '-'}:
490+
closeTag.add(lex.input[tempPos])
491+
inc tempPos
492+
if closeTag == tag:
493+
depth -= 1
494+
# Add chars to htmlContent until '>'
495+
while lex.current != '>' and lex.current != '\0':
496+
htmlContent.add(lex.current)
497+
lex.advance()
498+
if lex.current == '>':
499+
htmlContent.add(lex.current)
500+
lex.advance()
501+
continue
502+
else:
503+
# Possible nested opening tag
504+
var openTag = ""
505+
var tempPos = lex.pos + 1
506+
while tempPos < lex.input.len and lex.input[tempPos] in {'a'..'z', 'A'..'Z', '0'..'9', '_', '-'}:
507+
openTag.add(lex.input[tempPos])
508+
inc tempPos
509+
if openTag == tag:
510+
depth += 1
511+
htmlContent.add(lex.current)
512+
lex.advance()
513+
return newTokenTuple(lex, mtkHtml, htmlContent, wsno=lex.wsno, attrs=some(@[tag]))
465514
of '|':
466515
# Table row
467516
lex.strbuf.setLen(0)

src/marvdown/parser.nim

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,13 @@ proc slugify(input: string, lowercase = true, sep = "-"): string =
8888
if result.len > 0 and result[^1] != '-':
8989
result.add(sep)
9090
of Letters:
91-
result.add if lowercase: c.toLowerAscii else: c
91+
result.add if lowercase:
92+
c.toLowerAscii else: c
9293
of Digits:
9394
result.add c
95+
of '-':
96+
if result.len > 0 and result[^1] != '-':
97+
result.add('-')
9498
else: discard
9599

96100
proc parseImage(md: var Markdown): MarkdownNode =
@@ -370,8 +374,10 @@ proc parseInline(md: var Markdown, text: string): seq[MarkdownNode] =
370374
line: strongLine,
371375
wsno: strongWsno
372376
)
377+
node.wsno = curr.wsno
373378
if curr.kind == mtkStrong:
374379
curr = lex.nextToken()
380+
else: discard # todo handle unclosed strong
375381
else:
376382
node = MarkdownNode(
377383
kind: mdkText,
@@ -578,10 +584,12 @@ proc parseMarkdown(md: var Markdown, currentParagraph: var MarkdownNode) =
578584
md.ast.add(headingNode)
579585
md.advance()
580586
of mtkHtml:
587+
closeCurrentParagraph()
581588
let tag = curr.attrs.get()[0]
582589
let tagType = htmlparser.htmlTag(tag)
583590
if md.opts.allowed.len > 0:
584591
if not md.opts.allowed.contains(tagType):
592+
# TODO handle disallowed tags (e.g., escape or ignore)
585593
withCurrentParagraph do:
586594
let textValue =
587595
curr.token.multiReplace(("<", "&lt;"), (">", "&gt;"))
@@ -764,16 +772,17 @@ proc renderNode(md: var Markdown, node: MarkdownNode): string =
764772

765773
if md.opts.enableAnchors:
766774
# if anchors are enabled, generate unique anchors
767-
var anchor = slugify(parseHtml(innerContent).innerText)
775+
let title = parseHtml(innerContent).innerText
776+
var anchor = slugify(title)
768777
if md.selectorCounter.contains(anchor):
769778
# make unique anchors - e.g., "heading-2", "heading-3", etc.
770779
let count = md.selectorCounter[anchor] + 1
771780
md.selectorCounter[anchor] = count
772781
anchor.add("-" & $count)
773-
md.selectors[anchor] = anchor
782+
md.selectors[anchor] = title
774783
else: # first occurrence
775784
md.selectorCounter[anchor] = 1
776-
md.selectors[anchor] = anchor
785+
md.selectors[anchor] = title
777786
let anchorlink =
778787
a(href="#" & anchor, `class`="anchor-link",
779788
md.opts.anchorIcon)

0 commit comments

Comments
 (0)