Skip to content

Commit 1e48ca7

Browse files
dsnet authored and gopherbot committed
encoding/json: remove legacy option to EscapeInvalidUTF8
In the presence of invalid UTF-8, the AllowInvalidUTF8 option allows such bytes to be present, but silently mangles them using the Unicode replacement character. The v2 default is to emit the replacement character verbatim (which is valid UTF-8 and exactly what it is for). However, the v1 behavior has historically been to emit the escaped form of the replacement character. This behavior was introduced in https://go.dev/cl/11211045 where the documentation says that it is "replacing invalid bytes with the Unicode replacement rune U+FFFD", but the implementation actually replaces them with the escaped form of the Unicode replacement rune. Given that the documentation differs from the implementation, the actual behavior is likely an oversight. Given how esoteric this behavior is, we change the v1in2 behavior to avoid the unnecessary escaping and drop support for EscapeInvalidUTF8. This does not violate the Go compatibility agreement since we do not document what the exact syntactic output is. Also, there has already been prior precedent for changing the output:
* [encoding/json: encode \b and \f as '\b' and '\f' in JSON strings](https://go.dev/cl/521675)
* [encoding/json: encode \n in strings as "\n", not "\u000A"](https://go.dev/cl/4678046)
* [encoding/json: encode \t as \t instead of \u0009](https://go.dev/cl/162340043)
* [encoding/json: use standard ES6 formatting for numbers during marshal](https://go.dev/cl/30371)
Fixes #74551
Change-Id: Ib59a873c44713d302f1f6ab103ffba2520d63276
Reviewed-on: https://go-review.googlesource.com/c/go/+/687116
Auto-Submit: Joseph Tsai <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Damien Neil <[email protected]>
Reviewed-by: Johan Brandhorst-Satzkorn <[email protected]>
Reviewed-by: Carlos Amedee <[email protected]>
1 parent a0a99cb commit 1e48ca7

File tree

5 files changed

+9
-33
lines changed

5 files changed

+9
-33
lines changed

src/encoding/json/internal/jsonflags/flags.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ const (
5252
AllowInvalidUTF8 |
5353
EscapeForHTML |
5454
EscapeForJS |
55-
EscapeInvalidUTF8 |
5655
PreserveRawStrings |
5756
Deterministic |
5857
FormatNilMapAsNull |
@@ -77,7 +76,7 @@ const (
7776
WhitespaceFlags = AnyWhitespace | Indent | IndentPrefix
7877

7978
// AnyEscape is the set of flags related to escaping in a JSON string.
80-
AnyEscape = EscapeForHTML | EscapeForJS | EscapeInvalidUTF8
79+
AnyEscape = EscapeForHTML | EscapeForJS
8180

8281
// CanonicalizeNumbers is the set of flags related to raw number canonicalization.
8382
CanonicalizeNumbers = CanonicalizeRawInts | CanonicalizeRawFloats
@@ -97,7 +96,6 @@ const (
9796
ReorderRawObjects // encode only
9897
EscapeForHTML // encode only
9998
EscapeForJS // encode only
100-
EscapeInvalidUTF8 // encode only; only exposed in v1
10199
Multiline // encode only
102100
SpaceAfterColon // encode only
103101
SpaceAfterComma // encode only

src/encoding/json/internal/jsonwire/encode.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,7 @@ func AppendQuote[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflag
9292
case isInvalidUTF8(r, rn):
9393
hasInvalidUTF8 = true
9494
dst = append(dst, src[i:n-rn]...)
95-
if flags.Get(jsonflags.EscapeInvalidUTF8) {
96-
dst = append(dst, `\ufffd`...)
97-
} else {
98-
dst = append(dst, "\ufffd"...)
99-
}
95+
dst = append(dst, "\ufffd"...)
10096
i = n
10197
case (r == '\u2028' || r == '\u2029') && flags.Get(jsonflags.EscapeForJS):
10298
dst = append(dst, src[i:n-rn]...)

src/encoding/json/v2_decode_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,12 +1249,12 @@ func TestMarshalInvalidUTF8(t *testing.T) {
12491249
in string
12501250
want string
12511251
}{
1252-
{Name(""), "hello\xffworld", `"hello\ufffdworld"`},
1252+
{Name(""), "hello\xffworld", "\"hello\ufffdworld\""},
12531253
{Name(""), "", `""`},
1254-
{Name(""), "\xff", `"\ufffd"`},
1255-
{Name(""), "\xff\xff", `"\ufffd\ufffd"`},
1256-
{Name(""), "a\xffb", `"a\ufffdb"`},
1257-
{Name(""), "\xe6\x97\xa5\xe6\x9c\xac\xff\xaa\x9e", `"日本\ufffd\ufffd\ufffd"`},
1254+
{Name(""), "\xff", "\"\ufffd\""},
1255+
{Name(""), "\xff\xff", "\"\ufffd\ufffd\""},
1256+
{Name(""), "a\xffb", "\"a\ufffdb\""},
1257+
{Name(""), "\xe6\x97\xa5\xe6\x9c\xac\xff\xaa\x9e", "\"日本\ufffd\ufffd\ufffd\""},
12581258
}
12591259
for _, tt := range tests {
12601260
t.Run(tt.Name, func(t *testing.T) {

src/encoding/json/v2_diff_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -786,8 +786,8 @@ func TestInvalidUTF8(t *testing.T) {
786786
switch {
787787
case json.Version == "v1" && err != nil:
788788
t.Fatalf("json.Marshal error: %v", err)
789-
case json.Version == "v1" && string(got) != `"\ufffd"`:
790-
t.Fatalf(`json.Marshal = %s, want "\ufffd"`, got)
789+
case json.Version == "v1" && string(got) != "\"\ufffd\"":
790+
t.Fatalf(`json.Marshal = %s, want %q`, got, "\ufffd")
791791
case json.Version == "v2" && err == nil:
792792
t.Fatal("json.Marshal error is nil, want non-nil")
793793
}

src/encoding/json/v2_options.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,6 @@ type Options = jsonopts.Options
204204
// It is equivalent to the following boolean options being set to true:
205205
//
206206
// - [CallMethodsWithLegacySemantics]
207-
// - [EscapeInvalidUTF8]
208207
// - [FormatBytesWithLegacySemantics]
209208
// - [FormatTimeWithLegacySemantics]
210209
// - [MatchCaseSensitiveDelimiter]
@@ -279,23 +278,6 @@ func CallMethodsWithLegacySemantics(v bool) Options {
279278
}
280279
}
281280

282-
// EscapeInvalidUTF8 specifies that when encoding a [jsontext.String]
283-
// with bytes of invalid UTF-8, such bytes are escaped as
284-
// a hexadecimal Unicode codepoint (i.e., \ufffd).
285-
// In contrast, the v2 default is to use the minimal representation,
286-
// which is to encode invalid UTF-8 as the Unicode replacement rune itself
287-
// (without any form of escaping).
288-
//
289-
// This only affects encoding and is ignored when decoding.
290-
// The v1 default is true.
291-
func EscapeInvalidUTF8(v bool) Options {
292-
if v {
293-
return jsonflags.EscapeInvalidUTF8 | 1
294-
} else {
295-
return jsonflags.EscapeInvalidUTF8 | 0
296-
}
297-
}
298-
299281
// FormatBytesWithLegacySemantics specifies that handling of
300282
// []~byte and [N]~byte types follow legacy semantics:
301283
//

0 commit comments

Comments
 (0)