Skip to content

Commit 3027f17

Browse files
c42f and pfitzseb authored
Treat UTF-8 BOM as whitespace (#26)
Co-authored-by: Sebastian Pfitzner <[email protected]>
1 parent 98bd80c commit 3027f17

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

Tokenize/src/lexer.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ export tokenize
1616
"""
    ishex(c::Char)

Return `true` if `c` is a hexadecimal digit: a decimal digit, or a letter in
`'a'`–`'f'` or `'A'`–`'F'`.
"""
@inline function ishex(c::Char)
    isdigit(c) && return true
    lower = 'a' <= c <= 'f'
    return lower || ('A' <= c <= 'F')
end
1717
"""
    isbinary(c::Char)

Return `true` if `c` is a binary digit, i.e. exactly `'0'` or `'1'`.
"""
@inline isbinary(c::Char) = c in ('0', '1')
1818
"""
    isoctal(c::Char)

Return `true` if `c` is an octal digit, i.e. one of `'0'` through `'7'`.
"""
@inline isoctal(c::Char) = '0' <= c <= '7'
19-
# Whitespace predicate for a `Char`; delegates directly to `Base.isspace`.
@inline iswhitespace(c::Char) = Base.isspace(c)
19+
"""
    iswhitespace(c::Char)

Return `true` if `c` counts as whitespace: either the character U+FEFF
(the UTF-8 byte order mark) or anything accepted by `Base.isspace`.
"""
@inline function iswhitespace(c::Char)
    c === '\ufeff' && return true
    return Base.isspace(c)
end
2020

2121
struct StringState
2222
triplestr::Bool

Tokenize/test/lexer.jl

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -868,3 +868,16 @@ end
868868
check_kw_hashes(String([cs...]) for cs in Iterators.product(['a':'z' for _ in 1:len]...))
869869
end
870870
end
871+
872+
873+
# Tokenizes "\ufeff[1\ufeff2]" and checks the resulting token kinds: U+FEFF must
# come out as WHITESPACE both at the start of the input and between the two
# integer literals, followed by the closing bracket and the end marker.
@testset "UTF-8 BOM" begin
874+
@test Tokenize.Tokens.kind.(collect(tokenize("\ufeff[1\ufeff2]"))) == [
875+
Tokens.WHITESPACE,
876+
Tokens.LSQUARE,
877+
Tokens.INTEGER,
878+
Tokens.WHITESPACE,
879+
Tokens.INTEGER,
880+
Tokens.RSQUARE,
881+
Tokens.ENDMARKER
882+
]
883+
end

0 commit comments

Comments
 (0)