Skip to content

Commit 6f44f83

Browse files
mooreryan authored and lpil committed
Fix non-character handling in string.utf_codepoint
Treats `U+FFFE` and `U+FFFF` as valid Unicode codepoints rather than errors. See #778.
1 parent c5d0ede commit 6f44f83

File tree

3 files changed

+31
-4
lines changed

3 files changed

+31
-4
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
- The deprecated `function.compose`, `function.constant`, `function.apply*`,
77
`function.curry*`, `result.nil_error`, `list.concat`, `bool.compare`, and
88
`bool.to_int` functions have been removed.
9+
- Fixed a bug where `string.utf_codepoint` would treat valid Unicode codepoints
10+
`U+FFFE` and `U+FFFF` as invalid.
911

1012
## v0.51.0 - 2024-12-22
1113

src/gleam/string.gleam

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -809,7 +809,6 @@ pub fn from_utf_codepoints(utf_codepoints: List(UtfCodepoint)) -> String
809809
pub fn utf_codepoint(value: Int) -> Result(UtfCodepoint, Nil) {
810810
case value {
811811
i if i > 1_114_111 -> Error(Nil)
812-
65_534 | 65_535 -> Error(Nil)
813812
i if i >= 55_296 && i <= 57_343 -> Error(Nil)
814813
i if i < 0 -> Error(Nil)
815814
i -> Ok(unsafe_int_to_utf_codepoint(i))

test/gleam/string_test.gleam

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -702,17 +702,43 @@ pub fn from_utf_codepoints_test() {
702702
}
703703

704704
pub fn utf_codepoint_test() {
705-
string.utf_codepoint(1_114_444)
705+
// Less than the lower bound on valid codepoints
706+
string.utf_codepoint(-1)
706707
|> should.be_error
707708

708-
string.utf_codepoint(65_534)
709+
// The lower bound on valid codepoints
710+
string.utf_codepoint(0)
711+
|> should.be_ok
712+
713+
// The upper bound for valid code points
714+
string.utf_codepoint(1_114_111)
715+
|> should.be_ok
716+
717+
// Greater than the upper bound on valid codepoints
718+
string.utf_codepoint(1_114_112)
709719
|> should.be_error
710720

721+
// Non-characters U+FFFE and U+FFFF are valid codepoints. See (#778).
722+
string.utf_codepoint(65_534)
723+
|> should.be_ok
724+
string.utf_codepoint(65_535)
725+
|> should.be_ok
726+
727+
// One less than the lowest "High-surrogate code point"
728+
string.utf_codepoint(55_295)
729+
|> should.be_ok
730+
731+
// Lowest value of the "High-surrogate code point" (U+D800 to U+DBFF)
711732
string.utf_codepoint(55_296)
712733
|> should.be_error
713734

714-
string.utf_codepoint(-1)
735+
// Highest value of the "Low-surrogate code point" (U+DC00 to U+DFFF)
736+
string.utf_codepoint(57_343)
715737
|> should.be_error
738+
739+
// One greater than the highest "Low-surrogate code point"
740+
string.utf_codepoint(57_344)
741+
|> should.be_ok
716742
}
717743

718744
pub fn bit_array_utf_codepoint_test() {

0 commit comments

Comments (0)