Implement LaTeX environment lexing

tecosaur · tecosaur · commit 2d8dccbae7a0 · 2025-03-16T15:06:24.000+08:00
diff --git a/README.org b/README.org
@@ -47,7 +47,7 @@ TODO
 | PropertyDrawer      |      | X   |       |     |      |      |
 | Table               |      | X   |       |     |      |      |
 |---------------------+------+-----+-------+-----+------+------|
-| BabelCall           |      |     |       |     |      |      |
+| BabelCall           |      | -   |       |     |      |      |
 | Block               |      | X   |       |     |      |      |
 | Clock               |      | X   |       |     |      |      |
 | DiarySexp           |      | X   |       |     |      |      |
@@ -57,7 +57,7 @@ TODO
 | HorizontalRule      |      | X   |       |     |      |      |
 | Keyword             |      | X   |       |     |      |      |
 | Affiliated Keywords |      | -   |       |     |      |      |
-| LaTeX Environment   |      |     |       |     |      |      |
+| LaTeX Environment   |      | X   |       |     |      |      |
 | NodeProperty        |      | X   |       |     |      |      |
 | Paragraph           |      |     |       |     |      |      |
 | TableRow            |      | X   |       |     |      |      |
diff --git a/src/lexer.jl b/src/lexer.jl
@@ -117,6 +117,8 @@ function lexnext(state::LexerState, bytes::DenseVector{UInt8}, start::UInt32)::T
             lex_comment(state, bytes, pos)
         elseif chr == UInt8('-') && ischarat(bytes, pos + 0x1, '-')
             lex_hrule(state, bytes, pos)
+        elseif chr == UInt8('\\') && hasprefix(bytes, pos + 0x1, "begin{")
+            lex_latexenv(state, bytes, pos)
         elseif K"heading" ∈ state.lastelement
             lex_planning(state, bytes, pos)
         else
@@ -458,7 +460,33 @@ function lex_hrule(::LexerState, bytes::DenseVector{UInt8}, pos::UInt32)
     Token(K"hrule", pos, rend - 0x1), lend
 end
 
-# TODO: LaTeX environments
+# Lex `\begin{NAME}` at `start` through the first later line starting with `\end{NAME}`,
+# with only `skipspaces` blanks after it; any failure to match yields `NONE_TOKEN`.
+function lex_latexenv(::LexerState, bytes::DenseVector{UInt8}, start::UInt32)
+    hasprefix(bytes, start, "\\begin{") || return NONE_TOKEN
+    namestart = start + ncodeunits("\\begin{") % UInt32
+    nameend = skipcharsets(bytes, namestart, ('a':'z', 'A':'Z', '0':'9', '*'))
+    # Require a non-empty NAME: zero `namelen` would wrap the unsigned `namelen - 0x1`.
+    namestart < nameend < length(bytes) && bytes[nameend] == UInt8('}') || return NONE_TOKEN
+    namelen = nameend - namestart
+    pos = start
+    while pos <= length(bytes)
+        pos = lineend(bytes, pos) + 0x1
+        pos = skipspaces(bytes, pos).stop
+        hasprefix(bytes, pos, "\\end{") || continue
+        pos += ncodeunits("\\end{") % UInt32
+        # `pos + namelen` is the closing brace, which may be the very last byte of input.
+        pos + namelen <= length(bytes) || return NONE_TOKEN
+        namematch = true
+        for offset in 0:namelen-0x1
+            namematch &= bytes[namestart + offset] == bytes[pos + offset]
+        end
+        namematch && bytes[pos + namelen] == UInt8('}') || continue
+        islineend(bytes, skipspaces(bytes, pos + namelen + 0x1).stop) || return NONE_TOKEN
+        return Token(K"latex_environment", start, pos + namelen), lineend(bytes, pos)
+    end
+    NONE_TOKEN
+end
 
 # TODO: Paragraphs
 
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -288,6 +288,36 @@ end
         @test collect(Lexer("-----   ")) ==
             [Token(K"hrule", 1, 5)]
     end
+    @testset "LaTeX envs" begin  # expected tokens carry 1-based inclusive byte spans
+        @test collect(Lexer("""
+            \\begin{env}
+            stuff
+            \\end{env}
+            """)) ==
+                [Token(K"latex_environment", 1, 27)]  # `\begin{env}` .. `}` of `\end{env}`
+        @test collect(Lexer("""
+            \\begin{env}
+            stuff
+            \\end{env}fluff
+            """)) ==
+                Token[]  # text after `\end{env}` on its line: no environment token
+        @test collect(Lexer("""
+            \\begin{equation*}
+            \\begin{align}
+            a &= b \\\\
+            c &= d
+            \\end{align}
+            \\end{equation*}
+
+            foo bar
+
+            \\begin{equation*}
+            x^2 + y^2 = z^2
+            \\end{equation*}
+            """)) ==
+                [Token(K"latex_environment", 1, 76)  # outer equation*, nested align inside
+                 Token(K"latex_environment", 88, 136)]  # second equation* after "foo bar"
+    end
     @testset "Type inference" begin
         @testset "Utilities" begin
             bytes, pos = codeunits("abc"), UInt32(1)
@@ -326,6 +356,7 @@ end
             @inferred Tuple{Token, UInt32} Org.lex_comment(lstate, bytes, pos)
             @inferred Tuple{Token, UInt32} Org.lex_fixedwidth(lstate, bytes, pos)
             @inferred Tuple{Token, UInt32} Org.lex_hrule(lstate, bytes, pos)
+            @inferred Tuple{Token, UInt32} Org.lex_latexenv(lstate, bytes, pos)
         end
     end
     @testset "Unhandled errors" begin
@@ -366,6 +397,7 @@ end
             @test_call Org.lex_comment(lstate, bytes, pos)
             @test_call Org.lex_fixedwidth(lstate, bytes, pos)
             @test_call Org.lex_hrule(lstate, bytes, pos)
+            @test_call Org.lex_latexenv(lstate, bytes, pos)
         end
         @testset "Iteration" begin
             @test_call iterate(Lexer("abc"), LexerState())
@@ -409,6 +441,7 @@ end
             @test_opt Org.lex_comment(lstate, bytes, pos)
             @test_opt Org.lex_fixedwidth(lstate, bytes, pos)
             @test_opt Org.lex_hrule(lstate, bytes, pos)
+            @test_opt Org.lex_latexenv(lstate, bytes, pos)
         end
         @testset "Iteration" begin
             @test_opt iterate(Lexer("abc"), LexerState())