@@ -75,6 +75,7 @@ function lexnext(state::LexerState, bytes::DenseVector{UInt8}, start::UInt32)
7575 end
7676 skipws = skipspaces(bytes, linestart)
7777 pos = skipws. stop
78+ pos > length(bytes) && return NONE_TOKEN
7879 if state. lastelement == K" "
7980 elseif state. lastelement ∈ K" <footnote_definition"
8081 return Token(K" <paragraph" , pos, pos), pos
@@ -197,6 +198,12 @@ function lexnext_object(state::LexerState, bytes::DenseVector{UInt8},
197198 end
198199 elseif chr ∈ (UInt8(' *' ), UInt8(' /' ), UInt8(' _' ), UInt8(' =' ), UInt8(' ~' ), UInt8(' +' ))
199200 lex_markup(state, bytes, pos)
201+ elseif chr == UInt8(' \\ ' )
202+ tok = lex_entity(state, bytes, pos)
203+ if tok == NONE_TOKEN
204+ tok = lex_latexfrag(state, bytes, pos)
205+ end
206+ tok
200207 else
201208 NONE_TOKEN
202209 end
524531
525532# Object lexing
526533
527- # TODO : Entities
"""
    lex_entity(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)

Try to lex an entity (a backslash followed by an entity name) starting at `pos`.

The candidate name is one of:
- `frac` followed by exactly two characters from `'1':'4'`, then any letters,
- a run of letters (as decided by `isletter`),
- an underscore followed by spaces (the `_SPC` pattern), scanned no further
  than `pos + 0x14`.

The name bytes are hashed with `strhash` and must be a member of `ENTITY_KEYS`.
The name may be followed by a literal `{}` (which is consumed); otherwise it
must be followed by a line end or a non-letter.

# Returns
- On success, `(token, next)`: `token` spans `pos` to `next - 1` and carries the
  `entity` kind tagged with the XOR-fold of the name hash's bytes; `next` is the
  position right after the entity.
- `NONE_TOKEN` when there is no entity at `pos`.
"""
function lex_entity(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
    # An entity needs a backslash plus at least one more byte; the bounds guard
    # mirrors the one in `lex_latexfrag`.
    pos < length(bytes) && bytes[pos] == UInt8('\\') || return NONE_TOKEN
    namestart = pos + 0x1
    nameend = namestart
    if hasprefix(bytes, nameend, "frac")
        # `frac` names embed digits: require exactly two of them.
        # NOTE(review): assumes `skipchars` returns the first position that is
        # not in the given range — confirm against its definition.
        fraclen = ncodeunits("frac") % UInt32
        nameend = skipchars(bytes, nameend + fraclen, '1':'4')
        nameend == pos + fraclen + 0x3 || return NONE_TOKEN
    end
    while nameend <= length(bytes)
        chr, len = charat(bytes, nameend)
        isletter(chr) || break
        nameend += len
    end
    # Handle `_SPC` pattern
    if nameend == namestart && hasprefix(bytes, nameend, "_ ")
        nameend += 0x1
        while nameend <= min(length(bytes), pos + 0x14)
            bytes[nameend] == UInt8(' ') || break
            nameend += 0x1
        end
    end
    # Nothing was consumed after the backslash: no entity name.
    nameend == namestart && return NONE_TOKEN
    ehash = strhash(bytes, namestart, nameend - 0x1)
    ehash ∈ ENTITY_KEYS || return NONE_TOKEN
    if nameend < length(bytes) && hasprefix(bytes, nameend, "{}")
        # Explicit `{}` terminator is part of the entity.
        nameend += 0x2
    elseif !islineend(bytes, nameend) && isletter(bytes, nameend)
        # Directly followed by a letter: not a complete entity name.
        return NONE_TOKEN
    end
    # Fold the hash down to a single byte to use as the token tag.
    eshort = reduce(⊻, reinterpret(NTuple{sizeof(UInt), UInt8}, ehash))
    return Token(settag(K"entity", eshort), pos, nameend - 0x1), nameend
end
566+
"""
    lex_latexfrag(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)

Try to lex a LaTeX fragment starting at the backslash at `pos`.

Three forms are recognised:
- a backslash, a non-empty run of letters, and any number of `[...]` / `{...}`
  groups (kind `latex_fragment[1]`). A `[...]` group may not contain `{`, `}`,
  `[`, `]` or a newline; a `{...}` group may not contain `{`, `}` or a newline.
- inline math opened by a backslash and `(`, closed by a backslash and `)`
  (kind `latex_fragment[2]`).
- display math opened by a backslash and `[`, closed by a backslash and `]`
  (kind `latex_fragment[3]`).

The delimited forms may span lines, but are rejected when a line inside them
starts with `*` or is blank.

# Returns
- On success, `(token, next)` where `next` is the position right after the
  fragment.
- `NONE_TOKEN` when there is no fragment at `pos`.
"""
function lex_latexfrag(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
    pos < length(bytes) && bytes[pos] == UInt8('\\') || return NONE_TOKEN
    nchar = bytes[pos + 0x1]
    if nchar ∉ (UInt8('('), UInt8('['))
        nameend = skipletters(bytes, pos + 0x1)
        # The name must contain at least one letter. Checking this before the
        # bracket loop stops a backslash followed directly by `{...}` from
        # being accepted as a fragment.
        nameend == pos + 0x1 && return NONE_TOKEN
        while nameend < length(bytes) && bytes[nameend] ∈ (UInt8('['), UInt8('{'))
            # NOTE(review): assumes `nextchar` returns the position of the first
            # of the given characters at or after its start argument — confirm.
            if bytes[nameend] == UInt8('[')
                nameend = nextchar(bytes, nameend + 0x1, ('{', '}', '[', ']', '\n'))
                ischarat(bytes, nameend, ']') || return NONE_TOKEN
            else
                nameend = nextchar(bytes, nameend + 0x1, ('{', '}', '\n'))
                ischarat(bytes, nameend, '}') || return NONE_TOKEN
            end
            nameend += 0x1
        end
        return Token(K"latex_fragment[1]", pos, nameend - 0x1), nameend
    end
    # Delimited form: choose the closing character and token kind together.
    echar, kind = if nchar == UInt8('(')
        UInt8(')'), K"latex_fragment[2]"
    else
        UInt8(']'), K"latex_fragment[3]"
    end
    texend = pos + 0x2
    while true
        texend < length(bytes) || return NONE_TOKEN
        texend = nextchar(bytes, texend, ('\\', '\n'))
        # At least one more byte must follow for a closer (or a next line).
        texend < length(bytes) || return NONE_TOKEN
        if bytes[texend] == UInt8('\n')
            # A following line starting with `*` ends the fragment attempt.
            bytes[texend + 0x1] == UInt8('*') && return NONE_TOKEN
            texend = skipspaces(bytes, texend + 0x1).stop
            # So does a blank line.
            islineend(bytes, texend) && return NONE_TOKEN
        else
            texend += 0x1
            bytes[texend - 0x1] == UInt8('\\') && bytes[texend] == echar && break
        end
    end
    # `texend` now sits on the closing delimiter character.
    return Token(kind, pos, texend), texend + 0x1
end
528608
529609# TODO : Export snippets
530610
@@ -788,8 +868,8 @@ julia> charat(cu, 7)
788868"""
789869function charat(bytes:: DenseVector{UInt8} , pos:: I ) where {I <: Integer }
790870 b1 = bytes[pos]
791- b1 < 0x80 && return UInt32(b1), 1 # ASCII fast-path
792- len = utf8bytes(b1)
871+ b1 < 0x80 && return UInt32(b1), one(I) # ASCII fast-path
872+ len = utf8bytes(b1) % I
793873 if len == 2 && length(bytes) >= pos + 1
794874 b2 = bytes[pos + 1 ]
795875 UInt32(b1 & 0x1F ) << 6 | b2 & 0x3f
@@ -805,35 +885,109 @@ function charat(bytes::DenseVector{UInt8}, pos::I) where {I <: Integer}
805885 UInt32(b3 & 0x3f ) << 6 | b4 & 0x3f
806886 else
807887 0x0000fffd
808- end , len % I
888+ end , len
809889end
810890
811- """
812- skipwords(bytes::DenseVector{UInt8}, pos::Integer, extras) -> Integer
813-
814- Skip over all word-constituent characters in `bytes` starting at `pos`.
815-
816- If `extras` is provided, then any character in `extras` is also considered,
817- where `extras` is a tuple of characters as `UInt8`s or `Char`s.
818- """
819- function skipwords(bytes:: DenseVector{UInt8} , pos:: I , extras:: NTuple{N, C} = (); limit:: I = length(bytes) % I):: I where {I <: Integer , N, C <: Union{Char, UInt8} }
"""
    skipcondition(bytes, pos, charcond, asciis, extras; limit) -> Integer

Scan `bytes` from `pos` up to `limit`, skipping every character that is accepted.

An ASCII byte is accepted when it lies in one of the inclusive `(first, last)`
ranges in `asciis`, or equals one of `extras`. A non-ASCII character is decoded
with `charat` and accepted when `charcond(codepoint)` returns `true`.

# Arguments
- `bytes`: the UTF-8 encoded input.
- `pos`: position to start scanning from.
- `charcond`: predicate applied to the codepoint of non-ASCII characters.
- `asciis`: tuple of `(Char, Char)` ASCII ranges; each `Char` must fit in a `UInt8`.
- `extras`: tuple of additional accepted `Char`s; each must fit in a `UInt8`.
- `limit`: last position that may be examined.

# Returns
The position of the first character that is not accepted, or the first position
past `limit` reached when every examined character was accepted.
"""
function skipcondition(bytes::DenseVector{UInt8}, pos::I, charcond::F,
                       asciis::NTuple{A, Tuple{Char, Char}}, extras::NTuple{E, Char};
                       limit::I) where {I <: Integer, F, A, E}
    alsoskip = map(UInt8, extras)
    ascii8s = map(((c1, c2),) -> (UInt8(c1), UInt8(c2)), asciis)
    next = pos
    while next <= limit
        b1 = bytes[next]
        if b1 < 0x80
            # Single-byte (ASCII) character, 0x7f included.
            inrange = false
            for (start, stop) in ascii8s
                if start <= b1 <= stop
                    inrange = true
                    break
                end
            end
            (inrange || b1 ∈ alsoskip) || return next
            next += one(I)
        else
            # Multi-byte character: decode it and defer to `charcond`.
            chr, len = charat(bytes, next)
            charcond(chr) || return next
            next += len
        end
    end
    return next
end
915+
"""
    skipwords(bytes::DenseVector{UInt8}, pos::Integer, extras = (); limit) -> Integer

Skip over all word-constituent characters in `bytes` starting at `pos`.

ASCII letters and digits are always skipped; non-ASCII characters are skipped
when `isword` accepts them. Any character in `extras` (a tuple of `Char`s) is
skipped as well.

# Keywords
- `limit`: last position that may be examined (defaults to `length(bytes)`).

# Returns
The position at which `skipcondition` stopped, converted to the type of `pos`.
"""
function skipwords(bytes::DenseVector{UInt8}, pos::I, extras::NTuple{N, Char} = ();
                   limit::I = length(bytes) % I)::I where {I <: Integer, N}
    asciiranges = (('a', 'z'), ('A', 'Z'), ('0', '9'))
    return @inline skipcondition(bytes, pos, isword, asciiranges, extras; limit)
end
927+
"""
    isword(chr::UInt32) -> Bool
    isword(bytes::DenseVector{UInt8}, pos::Integer) -> Bool

Test whether a character is word-constituent.

The character is given either directly as the Unicode codepoint `chr`, or as
the character found at position `pos` of the byte array `bytes`.

A character qualifies when its Unicode general category is one of:
- `Lu`, `Ll`, `Lt`, `Lm`, `Lo` (letters)
- `Mn`, `Mc`, `Me` (marks)
- `Nd`, `Nl` (decimal digits and letter numbers)
- `Pc` (connector punctuation)
- `So` (other symbols)
"""
function isword(chr::UInt32)
    category = Base.Unicode.category_code(chr)
    # Codes 1 through 10 are the letter, mark, `Nd` and `Nl` categories;
    # 12 is `Pc` and 22 is `So`.
    return category in 1:10 || category == 12 || category == 22
end

function isword(bytes::DenseVector{UInt8}, pos::Integer)
    codepoint, _ = charat(bytes, pos)
    return isword(codepoint)
end
957+
"""
    skipletters(bytes::DenseVector{UInt8}, pos::Integer, extras = (); limit) -> Integer

Skip over all letter characters in `bytes` starting at `pos`.

ASCII letters are always skipped; non-ASCII characters are skipped when
`isletter` accepts them. Any character in `extras` (a tuple of `Char`s) is
skipped as well.

# Keywords
- `limit`: last position that may be examined (defaults to `length(bytes)`).

# Returns
The position at which `skipcondition` stopped, converted to the type of `pos`.
"""
function skipletters(bytes::DenseVector{UInt8}, pos::I, extras::NTuple{N, Char} = ();
                     limit::I = length(bytes) % I)::I where {I <: Integer, N}
    asciiranges = (('a', 'z'), ('A', 'Z'))
    return @inline skipcondition(bytes, pos, isletter, asciiranges, extras; limit)
end
969+
"""
    isletter(chr::UInt32) -> Bool
    isletter(bytes::DenseVector{UInt8}, pos::Integer) -> Bool

Test whether a character is a letter.

The character is given either directly as the Unicode codepoint `chr`, or as
the character found at position `pos` of the byte array `bytes`.

A character qualifies when its Unicode general category is one of
`Lu`, `Ll`, `Lt`, `Lm` or `Lo`.
"""
function isletter(chr::UInt32)
    category = Base.Unicode.category_code(chr)
    # Codes 1 through 5 are the five letter categories.
    return category in 1:5
end

function isletter(bytes::DenseVector{UInt8}, pos::Integer)
    codepoint, _ = charat(bytes, pos)
    return isletter(codepoint)
end
838992
839993"""
@@ -1041,3 +1195,8 @@ function word2tag(bytes::DenseVector{UInt8}, start::Integer, stop::Integer)
10411195 h8 = reinterpret(NTuple{8 , UInt8}, h)
10421196 reduce(xor, h8)
10431197end
1198+
# strhash(bytes, start, stop) -> UInt
#
# Hash the bytes `bytes[start:stop]` (inclusive) in place, without copying them
# into a `String`, by calling `Base.memhash` seeded with `Base.memhash_seed`.
#
# The caller is responsible for `start` and `stop` being valid positions in
# `bytes`; no bounds check is performed before the raw pointer is read.
#
# NOTE(review): `:total` also asserts the call never throws and is consistent;
# that only holds for in-bounds arguments with `stop >= start - 1` and for
# `bytes` that are not mutated — confirm the callers guarantee this.
Base.@assume_effects :total function strhash(bytes::DenseVector{UInt8}, start::Integer, stop::Integer)
    nbytes = stop - start + 0x1
    # Keep `bytes` rooted while its raw pointer is in use by the C call.
    digest = GC.@preserve bytes ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32),
                                      pointer(bytes, start), nbytes, Base.memhash_seed % UInt32)
    return digest + Base.memhash_seed
end
0 commit comments