Skip to content

Commit 6022f1c

Browse files
jonathaningram authored and ericwenn committed
fix(filtering): � REPLACEMENT CHARACTER is valid UTF-8
1 parent c9202f0 commit 6022f1c

File tree

2 files changed

+23
-1
lines changed

2 files changed

+23
-1
lines changed

filtering/lexer.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,11 @@ func (l *Lexer) nextRune() (rune, error) {
130130
switch {
131131
case n == 0:
132132
return r, io.EOF
133-
case r == utf8.RuneError:
133+
// If the input rune was the replacement character (`\uFFFD`) preserve it.
134+
//
135+
// If the input rune was invalid (and converted to replacement character)
136+
// return an error.
137+
case r == utf8.RuneError && (n != 3 || l.remainingFilter()[:3] != "\xef\xbf\xbd"):
134138
return r, l.errorf("invalid UTF-8")
135139
}
136140
if r == '\n' {

filtering/lexer_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,24 @@ func TestLexer(t *testing.T) {
315315
filter: "invalid = foo\xa0\x01bar",
316316
errorContains: "invalid UTF-8",
317317
},
318+
{
319+
filter: `object_id = "�g/ml" OR object_id = "µg/ml"`, // replacement character is valid UTF-8
320+
expected: []Token{
321+
{Position: Position{Offset: 0, Column: 1, Line: 1}, Type: TokenTypeText, Value: "object_id"},
322+
{Position: Position{Offset: 9, Column: 10, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
323+
{Position: Position{Offset: 10, Column: 11, Line: 1}, Type: TokenTypeEquals, Value: "="},
324+
{Position: Position{Offset: 11, Column: 12, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
325+
{Position: Position{Offset: 12, Column: 13, Line: 1}, Type: TokenTypeString, Value: `"�g/ml"`},
326+
{Position: Position{Offset: 21, Column: 20, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
327+
{Position: Position{Offset: 22, Column: 21, Line: 1}, Type: TokenTypeOr, Value: "OR"},
328+
{Position: Position{Offset: 24, Column: 23, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
329+
{Position: Position{Offset: 25, Column: 24, Line: 1}, Type: TokenTypeText, Value: "object_id"},
330+
{Position: Position{Offset: 34, Column: 33, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
331+
{Position: Position{Offset: 35, Column: 34, Line: 1}, Type: TokenTypeEquals, Value: "="},
332+
{Position: Position{Offset: 36, Column: 35, Line: 1}, Type: TokenTypeWhitespace, Value: " "},
333+
{Position: Position{Offset: 37, Column: 36, Line: 1}, Type: TokenTypeString, Value: `"µg/ml"`},
334+
},
335+
},
318336
} {
319337
tt := tt
320338
t.Run(tt.filter, func(t *testing.T) {

0 commit comments

Comments
 (0)