Skip to content

Commit f1b5575

Browse files
committed
Implement markup lexing
1 parent 692ef92 commit f1b5575

File tree

3 files changed

+104
-8
lines changed

3 files changed

+104
-8
lines changed

README.org

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,5 @@ TODO
8484
| Superscript | | | | | | |
8585
| TableCell | | X | | | | |
8686
| Timestamp | | | | | | |
87+
| TextMarkup | | X | | | | |
8788
| TextPlain | | | | | | |
88-
| TextMarkup | | | | | | |

src/lexer.jl

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ function Base.iterate(lex::Lexer, state::LexerState)
3333
while position <= length(lex.input)
3434
token, position = @inline lexnext(state, lex.input, position)
3535
if token.kind in K"elements"
36+
ctx = ctx & ~!K"objects"
3637
lastelement = token.kind
3738
end
3839
if token.kind == K"plaintext"
@@ -69,6 +70,9 @@ const NONE_TOKEN = Token(K"", 0, 0), UInt32(0)
6970

7071
function lexnext(state::LexerState, bytes::DenseVector{UInt8}, start::UInt32)
7172
linestart, newlines = @inline skipnewlines(bytes, start)
73+
if start == 1 && state.lastelement != K"<paragraph"
74+
newlines = 1
75+
end
7276
skipws = skipspaces(bytes, linestart)
7377
pos = skipws.stop
7478
if state.lastelement == K""
@@ -95,7 +99,7 @@ function lexnext(state::LexerState, bytes::DenseVector{UInt8}, start::UInt32)
9599
lexnext_object(state, bytes, start, linestart, pos, chr)
96100
end
97101
if next != NONE_TOKEN
98-
if K"paragraph" ∈ state.ctx
102+
if newlines != 0 && K"paragraph" ∈ state.ctx
99103
Token(K">paragraph", start - 0x1, start - 0x1), start
100104
else
101105
next
@@ -191,6 +195,8 @@ function lexnext_object(state::LexerState, bytes::DenseVector{UInt8},
191195
end
192196
Token(K"<table_cell", pos, pos), pos
193197
end
198+
elseif chr ∈ (UInt8('*'), UInt8('/'), UInt8('_'), UInt8('='), UInt8('~'), UInt8('+'))
199+
lex_markup(state, bytes, pos)
194200
else
195201
NONE_TOKEN
196202
end
@@ -548,7 +554,56 @@ end
548554

549555
# TODO: Timestamps
550556

551-
# TODO: Text markup
557+
function markupkind(delim::UInt8)
558+
if delim == UInt8('*')
559+
K"bold"
560+
elseif delim == UInt8('/')
561+
K"italic"
562+
elseif delim == UInt8('_')
563+
K"underline"
564+
elseif delim == UInt8('=')
565+
K"verbatim"
566+
elseif delim == UInt8('~')
567+
K"code"
568+
elseif delim == UInt8('+')
569+
K"strikethrough"
570+
else
571+
K""
572+
end
573+
end
574+
575+
const MARKUP_PRE_CHARS = let chars = "-({'\""
576+
Tuple(map(UInt8, collect(chars)))
577+
end
578+
579+
const MARKUP_POST_CHARS = let chars = "-.,;:!?')}[\"\\"
580+
Tuple(map(UInt8, collect(chars)))
581+
end
582+
583+
function lex_markup(state::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
584+
posprev = utf8prev(bytes, pos)
585+
posnext = utf8next(bytes, pos)
586+
side = if (posnext > length(bytes) || !iswhitespace(bytes, posnext)) &&
587+
(islinestart(bytes, posprev) || iswhitespace(bytes, posprev) || bytes[posprev] ∈ MARKUP_PRE_CHARS ||
588+
(bytes[posprev] != bytes[pos] && markupkind(bytes[posprev]) & state.restriction != K""))
589+
K"<"
590+
elseif (posprev == 0 || !iswhitespace(bytes, posprev)) &&
591+
(islineend(bytes, posnext) || iswhitespace(bytes, posnext) || bytes[posnext] ∈ MARKUP_POST_CHARS ||
592+
(bytes[posnext] != bytes[pos] && markupkind(bytes[posnext]) & state.ctx != K""))
593+
K">"
594+
else
595+
return NONE_TOKEN
596+
end
597+
kind = markupkind(bytes[pos])
598+
if kind == K""
599+
false
600+
elseif side == K"<"
601+
kind ∈ state.restriction
602+
else
603+
kind ∈ state.ctx
604+
end || return NONE_TOKEN
605+
Token(kind | side, pos, pos), posnext
606+
end
552607

553608

554609
# Utility functions
@@ -603,7 +658,7 @@ const PLAIN_SKIP_TABLE = let canskip = zeros(Bool, 255)
603658
for c in UInt8('0'):UInt8('9')
604659
canskip[c] = true
605660
end
606-
for c in "!\"&'(),.;?]}"
661+
for c in " !\"&'(),.;?]}"
607662
canskip[UInt8(c)] = true
608663
end
609664
Tuple(canskip)
@@ -687,7 +742,7 @@ function skipplain(bytes::DenseVector{UInt8}, start::I, multiline::Bool = false;
687742
end
688743
else
689744
clen = utf8bytes(chr) % I
690-
clen == 1 && pos > start && return pos - 0x1
745+
clen == 1 && pos > start && return pos
691746
pos += clen
692747
end
693748
end
@@ -908,6 +963,10 @@ function lineend(bytes::DenseVector{UInt8}, pos::I; limit::I = length(bytes) % I
908963
limit + 0x1
909964
end
910965

966+
function islinestart(bytes::DenseVector{UInt8}, pos::Integer)
967+
pos < 1 || pos <= length(bytes) && ischarat(bytes, pos, '\n')
968+
end
969+
911970
function hasprefix(bytes::DenseVector{UInt8}, start::Integer, pattern::String; limit::Integer = length(bytes) % typeof(start))
912971
limit >= start + ncodeunits(pattern) - 1 || return false
913972
for (i, c) in enumerate(codeunits(pattern))
@@ -943,7 +1002,7 @@ function untilwhitespace(bytes::DenseVector{UInt8}, pos::I; limit::I = length(by
9431002
end
9441003

9451004
function skipnewlines(bytes::DenseVector{UInt8}, pos::I; limit::I = length(bytes) % I)::Tuple{I, Int} where {I <: Integer}
946-
newlines = Int(pos == 1)
1005+
newlines = 0
9471006
while true
9481007
if bytes[pos] == UInt8('\n')
9491008
pos += 0x1

test/runtests.jl

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,6 @@ end
126126
""")) ==
127127
[Token(K"<drawer", 1, 8),
128128
Token(K"<paragraph", 10, 10),
129-
Token(K">paragraph", 16, 16),
130-
Token(K"<paragraph", 18, 18),
131129
Token(K">paragraph", 34, 34),
132130
Token(K">drawer", 36, 40)]
133131
end
@@ -392,6 +390,42 @@ end
392390
Token(K">paragraph", 85, 85),
393391
Token(K"latex_environment", 88, 136)]
394392
end
393+
@testset "Markup" begin
394+
@test collect(Lexer("*bold*")) ==
395+
[Token(K"<paragraph", 1, 1),
396+
Token(K"<bold", 1, 1),
397+
Token(K">bold", 6, 6)]
398+
@test collect(Lexer("*bold* /italic/ _underline_ ~code~ =verbatim= +strikethrough+")) ==
399+
[Token(K"<paragraph", 1, 1),
400+
Token(K"<bold", 1, 1),
401+
Token(K">bold", 6, 6),
402+
Token(K"<italic", 8, 8),
403+
Token(K">italic", 15, 15),
404+
Token(K"<underline", 17, 17),
405+
Token(K">underline", 27, 27),
406+
Token(K"<code", 29, 29),
407+
Token(K">code", 34, 34),
408+
Token(K"<verbatim", 36, 36),
409+
Token(K">verbatim", 45, 45),
410+
Token(K"<strikethrough", 47, 47),
411+
Token(K">strikethrough", 61, 61)]
412+
@test collect(Lexer("*/italic/*")) ==
413+
[Token(K"<paragraph", 1, 1),
414+
Token(K"<bold", 1, 1),
415+
Token(K"<italic", 2, 2),
416+
Token(K">italic", 9, 9),
417+
Token(K"<bold", 10, 10)]
418+
@test collect(Lexer("=*/italic/*=")) ==
419+
[Token(K"<paragraph", 1, 1),
420+
Token(K"<verbatim", 1, 1),
421+
Token(K">verbatim", 12, 12)]
422+
@test collect(Lexer("*hey =and /not italic/ verbatim= there* stuff")) ==
423+
[Token(K"<paragraph", 1, 1),
424+
Token(K"<bold", 1, 1),
425+
Token(K"<verbatim", 6, 6),
426+
Token(K">verbatim", 32, 32),
427+
Token(K">bold", 39, 39)]
428+
end
395429
@testset "Type inference" begin
396430
@testset "Utilities" begin
397431
bytes, pos = codeunits("abc"), UInt32(1)
@@ -430,6 +464,7 @@ end
430464
@inferred Tuple{Token, UInt32} Org.lex_fixedwidth(lstate, bytes, pos)
431465
@inferred Tuple{Token, UInt32} Org.lex_hrule(lstate, bytes, pos)
432466
@inferred Tuple{Token, UInt32} Org.lex_latexenv(lstate, bytes, pos)
467+
@inferred Tuple{Token, UInt32} Org.lex_markup(lstate, bytes, pos)
433468
end
434469
end
435470
@testset "Unhandled errors" begin
@@ -470,6 +505,7 @@ end
470505
@test_call Org.lex_fixedwidth(lstate, bytes, pos)
471506
@test_call Org.lex_hrule(lstate, bytes, pos)
472507
@test_call Org.lex_latexenv(lstate, bytes, pos)
508+
@test_call Org.lex_markup(lstate, bytes, pos)
473509
end
474510
@testset "Iteration" begin
475511
@test_call iterate(Lexer("abc"), LexerState())
@@ -513,6 +549,7 @@ end
513549
@test_opt Org.lex_fixedwidth(lstate, bytes, pos)
514550
@test_opt Org.lex_hrule(lstate, bytes, pos)
515551
@test_opt Org.lex_latexenv(lstate, bytes, pos)
552+
@test_opt Org.lex_markup(lstate, bytes, pos)
516553
end
517554
@testset "Iteration" begin
518555
@test_opt iterate(Lexer("abc"), LexerState())

0 commit comments

Comments
 (0)