Skip to content

Commit 61ab8ea

Browse files
committed
Implement entity and LaTeX fragment lexing
1 parent f1b5575 commit 61ab8ea

File tree

5 files changed

+730
-21
lines changed

5 files changed

+730
-21
lines changed

README.org

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ TODO
6464
| TableHRule | | X | | | | |
6565
| BlankLine | | | | | | |
6666
|---------------------+------+-----+-------+-----+------+------|
67-
| OrgEntity | | | | | | |
68-
| LaTeX Fragment | | | | | | |
67+
| OrgEntity | | X | | | | |
68+
| LaTeX Fragment | | X | | | | |
6969
| ExportSnippet | | | | | | |
7070
| FootnoteReference | | | | | | |
7171
| InlineBabelCall | | | | | | |

src/Org.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# Package entry module. The source files are included in the order listed, so
# everything defined by `syntax.jl` and `variables.jl` already exists by the
# time `lexer.jl` is evaluated.
module Org
55

66
include("syntax.jl")
7+
include("variables.jl")
78
include("lexer.jl")
89

910
end

src/lexer.jl

Lines changed: 178 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ function lexnext(state::LexerState, bytes::DenseVector{UInt8}, start::UInt32)
7575
end
7676
skipws = skipspaces(bytes, linestart)
7777
pos = skipws.stop
78+
pos > length(bytes) && return NONE_TOKEN
7879
if state.lastelement == K""
7980
elseif state.lastelement K"<footnote_definition"
8081
return Token(K"<paragraph", pos, pos), pos
@@ -197,6 +198,12 @@ function lexnext_object(state::LexerState, bytes::DenseVector{UInt8},
197198
end
198199
elseif chr (UInt8('*'), UInt8('/'), UInt8('_'), UInt8('='), UInt8('~'), UInt8('+'))
199200
lex_markup(state, bytes, pos)
201+
elseif chr == UInt8('\\')
202+
tok = lex_entity(state, bytes, pos)
203+
if tok == NONE_TOKEN
204+
tok = lex_latexfrag(state, bytes, pos)
205+
end
206+
tok
200207
else
201208
NONE_TOKEN
202209
end
@@ -524,7 +531,80 @@ end
524531

525532
# Object lexing
526533

527-
# TODO: Entities
534+
"""
    lex_entity(state::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)

Try to lex an Org entity that starts with the backslash at `pos`.

The accepted shapes are a backslash followed by letters, `frac` followed by
exactly two characters accepted by `skipchars(..., '1':'4')`, or `_` followed
by a run of spaces. The candidate name is hashed with `strhash` and must be a
member of `ENTITY_KEYS`. A directly following `{}` is consumed as part of the
entity; otherwise the entity must be followed by a line end or a non-letter.

Returns `NONE_TOKEN` when no entity is recognised. Otherwise returns a tuple
of the `K"entity"` token covering `pos` through the last consumed byte (tagged
with the name hash XOR-folded down to one byte) and the position after it.
"""
function lex_entity(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
    bytes[pos] == UInt8('\\') || return NONE_TOKEN
    namestart = pos + 0x1
    nameend = namestart
    if hasprefix(bytes, nameend, "frac")
        fraclen = ncodeunits("frac") % UInt32
        nameend = skipchars(bytes, nameend + fraclen, '1':'4')
        # Exactly two characters must have been skipped after `frac`.
        nameend == pos + fraclen + 0x3 || return NONE_TOKEN
    end
    # Extend the name over any run of letters.
    while nameend <= length(bytes)
        chr, len = charat(bytes, nameend)
        isletter(chr) || break
        nameend += len
    end
    # Handle the `_SPC` pattern: an underscore followed by one or more spaces.
    if nameend == namestart && hasprefix(bytes, nameend, "_ ")
        nameend += 0x1
        # NOTE(review): this bound lets `nameend` reach at most `pos + 0x15`;
        # confirm that matches the longest `_`+spaces key in `ENTITY_KEYS`.
        while nameend <= min(length(bytes), pos + 0x14)
            bytes[nameend] == UInt8(' ') || break
            nameend += 0x1
        end
    end
    nameend == namestart && return NONE_TOKEN
    ehash = strhash(bytes, namestart, nameend - 0x1)
    # ASCII `in` is used instead of the Unicode operator spelling.
    ehash in ENTITY_KEYS || return NONE_TOKEN
    if nameend < length(bytes) && hasprefix(bytes, nameend, "{}")
        # An explicit `{}` terminator belongs to the entity.
        nameend += 0x2
    elseif !islineend(bytes, nameend) && isletter(bytes, nameend)
        # Without `{}`, the entity may not run straight into a letter.
        return NONE_TOKEN
    end
    # Fold the hash to a single byte, as `word2tag` does with `reduce(xor, ...)`.
    eshort = reduce(xor, reinterpret(NTuple{sizeof(UInt), UInt8}, ehash))
    return Token(settag(K"entity", eshort), pos, nameend - 0x1), nameend
end
566+
567+
"""
    lex_latexfrag(state::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)

Try to lex a LaTeX fragment that starts with the backslash at `pos`.

Three forms are handled:
- a backslash followed by letters and any number of `[...]` / `{...}` groups
  that contain no nested brackets or braces and no newline, producing
  `K"latex_fragment[1]"`;
- `\\( ... \\)`, producing `K"latex_fragment[2]"`;
- `\\[ ... \\]`, producing `K"latex_fragment[3]"`.

For the delimited forms the scan gives up when a line inside the fragment
starts with `*` or is blank after leading spaces.

Returns `NONE_TOKEN` when no fragment is recognised, otherwise a tuple of the
token and the position following it.
"""
function lex_latexfrag(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
    # At least one byte must follow the backslash.
    (pos < length(bytes) && bytes[pos] == UInt8('\\')) || return NONE_TOKEN
    nchar = bytes[pos + 0x1]
    if !(nchar in (UInt8('('), UInt8('[')))
        # Macro-style fragment: name, then optional bracket/brace groups.
        nameend = skipletters(bytes, pos + 0x1)
        while nameend < length(bytes) && bytes[nameend] in (UInt8('['), UInt8('{'))
            if bytes[nameend] == UInt8('[')
                nameend = nextchar(bytes, nameend + 0x1, ('{', '}', '[', ']', '\n'))
                ischarat(bytes, nameend, ']') || return NONE_TOKEN
                nameend += 0x1
            elseif bytes[nameend] == UInt8('{')
                nameend = nextchar(bytes, nameend + 0x1, ('{', '}', '\n'))
                ischarat(bytes, nameend, '}') || return NONE_TOKEN
                nameend += 0x1
            end
        end
        # Nothing was consumed after the backslash: not a fragment.
        nameend == pos + 0x1 && return NONE_TOKEN
        return Token(K"latex_fragment[1]", pos, nameend - 0x1), nameend
    end
    # Delimited fragment: look for the matching backslash-closer.
    isparen = nchar == UInt8('(')
    echar = isparen ? UInt8(')') : UInt8(']')
    texend = pos + 0x2
    while true
        texend < length(bytes) || return NONE_TOKEN
        texend = nextchar(bytes, texend, ('\\', '\n'))
        texend < length(bytes) || return NONE_TOKEN
        if bytes[texend] == UInt8('\n')
            # A following line that starts with `*` ends the attempt.
            bytes[texend + 0x1] == UInt8('*') && return NONE_TOKEN
            texend = skipspaces(bytes, texend + 0x1).stop
            # So does a line that is empty after its leading spaces.
            islineend(bytes, texend) && return NONE_TOKEN
        else
            texend += 0x1
            bytes[texend - 0x1] == UInt8('\\') && bytes[texend] == echar && break
        end
    end
    kind = isparen ? K"latex_fragment[2]" : K"latex_fragment[3]"
    return Token(kind, pos, texend), texend + 0x1
end
528608

529609
# TODO: Export snippets
530610

@@ -788,8 +868,8 @@ julia> charat(cu, 7)
788868
"""
789869
function charat(bytes::DenseVector{UInt8}, pos::I) where {I <: Integer}
790870
b1 = bytes[pos]
791-
b1 < 0x80 && return UInt32(b1), 1 # ASCII fast-path
792-
len = utf8bytes(b1)
871+
b1 < 0x80 && return UInt32(b1), one(I) # ASCII fast-path
872+
len = utf8bytes(b1) % I
793873
if len == 2 && length(bytes) >= pos + 1
794874
b2 = bytes[pos + 1]
795875
UInt32(b1 & 0x1F) << 6 | b2 & 0x3f
@@ -805,35 +885,109 @@ function charat(bytes::DenseVector{UInt8}, pos::I) where {I <: Integer}
805885
UInt32(b3 & 0x3f) << 6 | b4 & 0x3f
806886
else
807887
0x0000fffd
808-
end, len % I
888+
end, len
809889
end
810890

811-
"""
812-
skipwords(bytes::DenseVector{UInt8}, pos::Integer, extras) -> Integer
813-
814-
Skip over all word-constituent characters in `bytes` starting at `pos`.
815-
816-
If `extras` is provided, then any character in `extras` is also considered,
817-
where `extras` is a tuple of characters as `UInt8`s or `Char`s.
818-
"""
819-
function skipwords(bytes::DenseVector{UInt8}, pos::I, extras::NTuple{N, C} = (); limit::I = length(bytes) % I)::I where {I <: Integer, N, C <: Union{Char, UInt8}}
891+
"""
    skipcondition(bytes, pos, charcond, asciis, extras; limit) -> Integer

Scan `bytes` forward from `pos` and return the position of the first
character that is not accepted, or the position just past `limit` when every
character up to `limit` is accepted.

An ASCII byte is accepted when it lies in one of the inclusive `asciis`
ranges (pairs of `Char`s) or equals one of the `extras` characters. Any other
character is decoded with `charat` and accepted when `charcond(codepoint)`
returns `true`.
"""
function skipcondition(bytes::DenseVector{UInt8}, pos::I, charcond::F, asciis::NTuple{A, Tuple{Char, Char}}, extras::NTuple{E, Char}; limit::I) where {I <: Integer, F, A, E}
    alsoskip = map(UInt8, extras)
    ascii8s = map(((c1, c2),) -> (UInt8(c1), UInt8(c2)), asciis)
    next = pos
    while next <= limit
        b1 = bytes[next]
        if b1 < 0x80
            # Single-byte (ASCII) character: compare raw bytes only.
            inrange = any(((lo, hi),) -> lo <= b1 <= hi, ascii8s)
            (inrange || b1 in alsoskip) || return next
            next += one(I)
        else
            # Multi-byte character: decode it and ask the predicate.
            chr, len = charat(bytes, next)
            charcond(chr) || return next
            next += len
        end
    end
    return next
end
915+
"""
    skipwords(bytes::DenseVector{UInt8}, pos::Integer, extras) -> Integer

Advance from `pos` over the word-constituent characters of `bytes`.

ASCII letters and digits always count as word constituents, as does any
character listed in `extras` (a tuple of `Char`s, empty by default). Other
characters are judged by `isword`. Scanning never proceeds past `limit`,
which defaults to the length of `bytes`.
"""
function skipwords(
    bytes::DenseVector{UInt8},
    pos::I,
    extras::NTuple{N, Char} = ();
    limit::I = length(bytes) % I,
)::I where {I <: Integer, N}
    wordranges = (('a', 'z'), ('A', 'Z'), ('0', '9'))
    return @inline skipcondition(bytes, pos, isword, wordranges, extras; limit)
end
927+
"""
    isword(chr::UInt32) -> Bool
    isword(bytes::DenseVector{UInt8}, pos::Integer) -> Bool

Return `true` if the Unicode codepoint `chr`, or the character at position
`pos` in the byte array `bytes`, is a word-constituent character.

The character must be a member of one of the following Unicode categories:
- `Ll`: Letter, Lowercase
- `Lu`: Letter, Uppercase
- `Lt`: Letter, Titlecase
- `Lo`: Letter, Other
- `Lm`: Letter, Modifier
- `Mn`: Mark, Nonspacing
- `Mc`: Mark, Spacing Combining
- `Me`: Mark, Enclosing
- `Nd`: Number, Decimal Digit
- `Nl`: Number, Letter
- `Pc`: Punctuation, Connector
- `So`: Symbol, Other
"""
function isword(chr::UInt32)
    code = Base.Unicode.category_code(chr)
    # Codes 1-10 span Lu through Nl; 12 is Pc and 22 is So.
    return 1 <= code <= 10 || code in (12, 22)
end

function isword(bytes::DenseVector{UInt8}, pos::Integer)
    return isword(first(charat(bytes, pos)))
end
957+
"""
    skipletters(bytes::DenseVector{UInt8}, pos::Integer, extras) -> Integer

Advance from `pos` over the letter characters of `bytes`.

ASCII letters always count, as does any character listed in `extras` (a tuple
of `Char`s, empty by default). Other characters are judged by `isletter`.
Scanning never proceeds past `limit`, which defaults to the length of `bytes`.
"""
function skipletters(
    bytes::DenseVector{UInt8},
    pos::I,
    extras::NTuple{N, Char} = ();
    limit::I = length(bytes) % I,
)::I where {I <: Integer, N}
    letterranges = (('a', 'z'), ('A', 'Z'))
    return @inline skipcondition(bytes, pos, isletter, letterranges, extras; limit)
end
969+
970+
"""
971+
isletter(chr::UInt32) -> Bool
972+
isletter(bytes::DenseVector{UInt8}, pos::Integer) -> Bool
973+
974+
Return `true` if the given Unicode codepoint or the character `chr` or at
975+
position `pos` in the byte array `bytes` is a letter character.
976+
977+
The character must be a member of one of the following Unicode categories:
978+
- `Ll`: Letter, Lowercase
979+
- `Lu`: Letter, Uppercase
980+
- `Lt`: Letter, Titlecase
981+
- `Lo`: Letter, Other
982+
- `Lm`: Letter, Modifier
983+
"""
984+
function isletter(chr::UInt32)
985+
code = Base.Unicode.category_code(chr)
986+
1 <= code <= 5
987+
end
988+
989+
function isletter(bytes::DenseVector{UInt8}, pos::Integer)
990+
isletter(first(charat(bytes, pos)))
837991
end
838992

839993
"""
@@ -1041,3 +1195,8 @@ function word2tag(bytes::DenseVector{UInt8}, start::Integer, stop::Integer)
10411195
h8 = reinterpret(NTuple{8, UInt8}, h)
10421196
reduce(xor, h8)
10431197
end
1198+
1199+
# Hash the bytes `bytes[start:stop]` (both ends inclusive) with Base's internal
# `memhash`, seeded with `Base.memhash_seed`, and add the seed to the result.
#
# The array is kept rooted with `GC.@preserve` for the duration of the `ccall`,
# since only a raw pointer into it is handed to C.
#
# NOTE(review): `Base.memhash` and `Base.memhash_seed` are Base internals —
# confirm they exist on every Julia version this package supports.
# NOTE(review): `:total` promises, among other things, that this never throws
# and is consistent; no bounds check is done on `start`/`stop` here, so callers
# must pass a valid range.
Base.@assume_effects :total function strhash(bytes::DenseVector{UInt8}, start::Integer, stop::Integer)
    seed = Base.memhash_seed
    nbytes = stop - start + 0x1
    h = GC.@preserve bytes ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32),
                                 pointer(bytes, start), nbytes, seed % UInt32)
    return h + seed
end

0 commit comments

Comments
 (0)