Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions internal/printer/utilities.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,26 @@ func encodeUtf16EscapeSequence(b *strings.Builder, charCode rune) {
b.WriteString(hexCharCode)
}

// decodeCESU8OrUTF8 reads one rune from the front of s and reports how many
// bytes it occupies. The three-byte sequence 0xED 0xA0–0xBF 0x80–0xBF, which is
// the CESU-8 form of a surrogate code unit (U+D800–U+DFFF), is decoded to that
// surrogate value with a width of 3. utf8.DecodeRuneInString rejects such bytes
// as invalid UTF-8 and reports utf8.RuneError, which would make different
// surrogates indistinguishable. Every other input is handed to
// utf8.DecodeRuneInString unchanged.
func decodeCESU8OrUTF8(s string) (rune, int) {
	if len(s) < 3 || s[0] != 0xED {
		return utf8.DecodeRuneInString(s)
	}
	second, third := s[1], s[2]
	if second < 0xA0 || second > 0xBF || third < 0x80 || third > 0xBF {
		return utf8.DecodeRuneInString(s)
	}
	// The lead byte 0xED contributes the fixed top nibble 0xD; the low six
	// bits of each trailing byte supply the remaining twelve bits.
	high := rune(second&0x3F) << 6
	low := rune(third & 0x3F)
	return 0xD000 | high | low, 3
}

// Based heavily on the abstract 'Quote'/'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
// but augmented for a few select characters (e.g. lineSeparator, paragraphSeparator, nextLine)
// Note that this doesn't actually wrap the input in double quotes.
func escapeStringWorker(s string, quoteChar QuoteChar, flags getLiteralTextFlags, b *strings.Builder) {
pos := 0
i := 0
for i < len(s) {
ch, size := utf8.DecodeRuneInString(s[i:])
ch, size := decodeCESU8OrUTF8(s[i:])

escape := false

Expand All @@ -104,7 +116,8 @@ func escapeStringWorker(s string, quoteChar QuoteChar, flags getLiteralTextFlags
escape = true
}
default:
if ch <= '\u001f' || flags&getLiteralTextFlagsNeverAsciiEscape == 0 && ch > '\u007f' {
if ch <= '\u001f' || flags&getLiteralTextFlagsNeverAsciiEscape == 0 && ch > '\u007f' ||
ch >= 0xD800 && ch <= 0xDFFF {
escape = true
}
}
Expand Down
15 changes: 8 additions & 7 deletions internal/scanner/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1749,6 +1749,9 @@ func (s *Scanner) scanEscapeSequence(flags EscapeSequenceScanningFlags) string {
if codePoint < 0 {
return s.text[start:s.pos]
}
if codePointIsHighSurrogate(codePoint) || codePointIsLowSurrogate(codePoint) {
return encodeSurrogate(codePoint)
}
return string(codePoint)
}
if codePoint < 0 {
Expand All @@ -1764,13 +1767,11 @@ func (s *Scanner) scanEscapeSequence(flags EscapeSequenceScanningFlags) string {
return string(surrogatePairToCodepoint(codePoint, nextCodePoint))
}
s.pos = savedPos
if flags&EscapeSequenceScanningFlagsRegularExpression != 0 {
return encodeSurrogate(codePoint)
}
} else if (codePointIsHighSurrogate(codePoint) || codePointIsLowSurrogate(codePoint)) &&
flags&EscapeSequenceScanningFlagsRegularExpression != 0 {
// Lone surrogate inside a non-unicode regex: encode as CESU-8 so scanClassRanges
// can compare surrogates numerically. Must NOT apply to string literals.
return encodeSurrogate(codePoint)
} else if codePointIsHighSurrogate(codePoint) || codePointIsLowSurrogate(codePoint) {
// Lone surrogate: encode as CESU-8 so that distinct surrogates remain
// distinguishable. Go's string(rune) would replace all surrogates with
// U+FFFD, collapsing e.g. "\uD800" and "\uDC00" into the same value.
return encodeSurrogate(codePoint)
Comment on lines +1771 to 1775
}
return string(codePoint)
Expand Down
16 changes: 14 additions & 2 deletions internal/stringutil/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ func CompareStringsCaseInsensitive(a string, b string) Comparison {
return ComparisonEqual
}
for {
ca, sa := utf8.DecodeRuneInString(a)
cb, sb := utf8.DecodeRuneInString(b)
ca, sa := decodeCESU8OrUTF8(a)
cb, sb := decodeCESU8OrUTF8(b)
if sa == 0 {
if sb == 0 {
return ComparisonEqual
Expand All @@ -60,6 +60,18 @@ func CompareStringsCaseInsensitive(a string, b string) Comparison {
}
}

// decodeCESU8OrUTF8 returns the first rune of s together with its byte width.
// When s starts with a CESU-8 encoded surrogate code unit (0xED followed by a
// byte in 0xA0–0xBF and a byte in 0x80–0xBF), the surrogate value in
// U+D800–U+DFFF is returned with a width of 3. All other input is decoded by
// utf8.DecodeRuneInString, which on its own would treat the surrogate bytes as
// invalid UTF-8 and yield utf8.RuneError one byte at a time.
func decodeCESU8OrUTF8(s string) (rune, int) {
	isSurrogate := len(s) >= 3 &&
		s[0] == 0xED &&
		0xA0 <= s[1] && s[1] <= 0xBF &&
		0x80 <= s[2] && s[2] <= 0xBF
	if !isSurrogate {
		return utf8.DecodeRuneInString(s)
	}
	// Start from the 0xD000 base implied by the 0xED lead byte, then fold in
	// the six payload bits carried by each of the two trailing bytes.
	codeUnit := rune(0xD000)
	codeUnit |= rune(s[1]&0x3F) << 6
	codeUnit |= rune(s[2] & 0x3F)
	return codeUnit, 3
}

// CompareStringsCaseSensitive orders a and b by delegating to strings.Compare,
// which compares the two strings lexicographically and yields a negative, zero,
// or positive result when a is less than, equal to, or greater than b.
func CompareStringsCaseSensitive(a string, b string) Comparison {
	ordering := strings.Compare(a, b)
	return ordering
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
loneSurrogateStringLiterals.ts(6,7): error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
loneSurrogateStringLiterals.ts(7,7): error TS2322: Type '"\uD800"' is not assignable to type '"\uDC00"'.
loneSurrogateStringLiterals.ts(10,7): error TS2322: Type '"\uD801"' is not assignable to type '"\uD800"'.
loneSurrogateStringLiterals.ts(11,7): error TS2322: Type '"\uD800"' is not assignable to type '"\uD801"'.
loneSurrogateStringLiterals.ts(14,7): error TS2322: Type '"\uDC01"' is not assignable to type '"\uDC00"'.
loneSurrogateStringLiterals.ts(15,7): error TS2322: Type '"\uDC00"' is not assignable to type '"\uDC01"'.
loneSurrogateStringLiterals.ts(20,7): error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
loneSurrogateStringLiterals.ts(21,7): error TS2322: Type '"\uD800"' is not assignable to type '"\uDC00"'.
loneSurrogateStringLiterals.ts(26,7): error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
loneSurrogateStringLiterals.ts(27,7): error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.


==== loneSurrogateStringLiterals.ts (10 errors) ====
// Lone surrogates should be distinct string literal types
const highSurrogate: "\uD800" = "\uD800"; // ok
const lowSurrogate: "\uDC00" = "\uDC00"; // ok

// These should be errors - different surrogates are not assignable to each other
const highToLow: "\uD800" = "\uDC00"; // error
~~~~~~~~~
!!! error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
const lowToHigh: "\uDC00" = "\uD800"; // error
~~~~~~~~~
!!! error TS2322: Type '"\uD800"' is not assignable to type '"\uDC00"'.

// Different high surrogates should also be distinct
const high1: "\uD800" = "\uD801"; // error
~~~~~
!!! error TS2322: Type '"\uD801"' is not assignable to type '"\uD800"'.
const high2: "\uD801" = "\uD800"; // error
~~~~~
!!! error TS2322: Type '"\uD800"' is not assignable to type '"\uD801"'.

// Different low surrogates should also be distinct
const low1: "\uDC00" = "\uDC01"; // error
~~~~
!!! error TS2322: Type '"\uDC01"' is not assignable to type '"\uDC00"'.
const low2: "\uDC01" = "\uDC00"; // error
~~~~
!!! error TS2322: Type '"\uDC00"' is not assignable to type '"\uDC01"'.

// Extended Unicode escape syntax should also work
const extHigh: "\u{D800}" = "\u{D800}"; // ok
const extLow: "\u{DC00}" = "\u{DC00}"; // ok
const extHighToLow: "\u{D800}" = "\u{DC00}"; // error
~~~~~~~~~~~~
!!! error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
const extLowToHigh: "\u{DC00}" = "\u{D800}"; // error
~~~~~~~~~~~~
!!! error TS2322: Type '"\uD800"' is not assignable to type '"\uDC00"'.

// Mixed syntax should also be equivalent
const mixedHigh: "\uD800" = "\u{D800}"; // ok
const mixedLow: "\u{DC00}" = "\uDC00"; // ok
const mixedError1: "\uD800" = "\u{DC00}"; // error
~~~~~~~~~~~
!!! error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.
const mixedError2: "\u{D800}" = "\uDC00"; // error
~~~~~~~~~~~
!!! error TS2322: Type '"\uDC00"' is not assignable to type '"\uD800"'.

Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//// [tests/cases/compiler/loneSurrogateStringLiterals.ts] ////

//// [loneSurrogateStringLiterals.ts]
// Lone surrogates should be distinct string literal types
const highSurrogate: "\uD800" = "\uD800"; // ok
const lowSurrogate: "\uDC00" = "\uDC00"; // ok

// These should be errors - different surrogates are not assignable to each other
const highToLow: "\uD800" = "\uDC00"; // error
const lowToHigh: "\uDC00" = "\uD800"; // error

// Different high surrogates should also be distinct
const high1: "\uD800" = "\uD801"; // error
const high2: "\uD801" = "\uD800"; // error

// Different low surrogates should also be distinct
const low1: "\uDC00" = "\uDC01"; // error
const low2: "\uDC01" = "\uDC00"; // error

// Extended Unicode escape syntax should also work
const extHigh: "\u{D800}" = "\u{D800}"; // ok
const extLow: "\u{DC00}" = "\u{DC00}"; // ok
const extHighToLow: "\u{D800}" = "\u{DC00}"; // error
const extLowToHigh: "\u{DC00}" = "\u{D800}"; // error

// Mixed syntax should also be equivalent
const mixedHigh: "\uD800" = "\u{D800}"; // ok
const mixedLow: "\u{DC00}" = "\uDC00"; // ok
const mixedError1: "\uD800" = "\u{DC00}"; // error
const mixedError2: "\u{D800}" = "\uDC00"; // error


//// [loneSurrogateStringLiterals.js]
"use strict";
// Lone surrogates should be distinct string literal types
const highSurrogate = "\uD800"; // ok
const lowSurrogate = "\uDC00"; // ok
// These should be errors - different surrogates are not assignable to each other
const highToLow = "\uDC00"; // error
const lowToHigh = "\uD800"; // error
// Different high surrogates should also be distinct
const high1 = "\uD801"; // error
const high2 = "\uD800"; // error
// Different low surrogates should also be distinct
const low1 = "\uDC01"; // error
const low2 = "\uDC00"; // error
// Extended Unicode escape syntax should also work
const extHigh = "\u{D800}"; // ok
const extLow = "\u{DC00}"; // ok
const extHighToLow = "\u{DC00}"; // error
const extLowToHigh = "\u{D800}"; // error
// Mixed syntax should also be equivalent
const mixedHigh = "\u{D800}"; // ok
const mixedLow = "\uDC00"; // ok
const mixedError1 = "\u{DC00}"; // error
const mixedError2 = "\uDC00"; // error
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
//// [tests/cases/compiler/loneSurrogateStringLiterals.ts] ////

=== loneSurrogateStringLiterals.ts ===
// Lone surrogates should be distinct string literal types
const highSurrogate: "\uD800" = "\uD800"; // ok
>highSurrogate : Symbol(highSurrogate, Decl(loneSurrogateStringLiterals.ts, 1, 5))

const lowSurrogate: "\uDC00" = "\uDC00"; // ok
>lowSurrogate : Symbol(lowSurrogate, Decl(loneSurrogateStringLiterals.ts, 2, 5))

// These should be errors - different surrogates are not assignable to each other
const highToLow: "\uD800" = "\uDC00"; // error
>highToLow : Symbol(highToLow, Decl(loneSurrogateStringLiterals.ts, 5, 5))

const lowToHigh: "\uDC00" = "\uD800"; // error
>lowToHigh : Symbol(lowToHigh, Decl(loneSurrogateStringLiterals.ts, 6, 5))

// Different high surrogates should also be distinct
const high1: "\uD800" = "\uD801"; // error
>high1 : Symbol(high1, Decl(loneSurrogateStringLiterals.ts, 9, 5))

const high2: "\uD801" = "\uD800"; // error
>high2 : Symbol(high2, Decl(loneSurrogateStringLiterals.ts, 10, 5))

// Different low surrogates should also be distinct
const low1: "\uDC00" = "\uDC01"; // error
>low1 : Symbol(low1, Decl(loneSurrogateStringLiterals.ts, 13, 5))

const low2: "\uDC01" = "\uDC00"; // error
>low2 : Symbol(low2, Decl(loneSurrogateStringLiterals.ts, 14, 5))

// Extended Unicode escape syntax should also work
const extHigh: "\u{D800}" = "\u{D800}"; // ok
>extHigh : Symbol(extHigh, Decl(loneSurrogateStringLiterals.ts, 17, 5))

const extLow: "\u{DC00}" = "\u{DC00}"; // ok
>extLow : Symbol(extLow, Decl(loneSurrogateStringLiterals.ts, 18, 5))

const extHighToLow: "\u{D800}" = "\u{DC00}"; // error
>extHighToLow : Symbol(extHighToLow, Decl(loneSurrogateStringLiterals.ts, 19, 5))

const extLowToHigh: "\u{DC00}" = "\u{D800}"; // error
>extLowToHigh : Symbol(extLowToHigh, Decl(loneSurrogateStringLiterals.ts, 20, 5))

// Mixed syntax should also be equivalent
const mixedHigh: "\uD800" = "\u{D800}"; // ok
>mixedHigh : Symbol(mixedHigh, Decl(loneSurrogateStringLiterals.ts, 23, 5))

const mixedLow: "\u{DC00}" = "\uDC00"; // ok
>mixedLow : Symbol(mixedLow, Decl(loneSurrogateStringLiterals.ts, 24, 5))

const mixedError1: "\uD800" = "\u{DC00}"; // error
>mixedError1 : Symbol(mixedError1, Decl(loneSurrogateStringLiterals.ts, 25, 5))

const mixedError2: "\u{D800}" = "\uDC00"; // error
>mixedError2 : Symbol(mixedError2, Decl(loneSurrogateStringLiterals.ts, 26, 5))

Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//// [tests/cases/compiler/loneSurrogateStringLiterals.ts] ////

=== loneSurrogateStringLiterals.ts ===
// Lone surrogates should be distinct string literal types
const highSurrogate: "\uD800" = "\uD800"; // ok
>highSurrogate : "\uD800"
>"\uD800" : "\uD800"

const lowSurrogate: "\uDC00" = "\uDC00"; // ok
>lowSurrogate : "\uDC00"
>"\uDC00" : "\uDC00"

// These should be errors - different surrogates are not assignable to each other
const highToLow: "\uD800" = "\uDC00"; // error
>highToLow : "\uD800"
>"\uDC00" : "\uDC00"

const lowToHigh: "\uDC00" = "\uD800"; // error
>lowToHigh : "\uDC00"
>"\uD800" : "\uD800"

// Different high surrogates should also be distinct
const high1: "\uD800" = "\uD801"; // error
>high1 : "\uD800"
>"\uD801" : "\uD801"

const high2: "\uD801" = "\uD800"; // error
>high2 : "\uD801"
>"\uD800" : "\uD800"

// Different low surrogates should also be distinct
const low1: "\uDC00" = "\uDC01"; // error
>low1 : "\uDC00"
>"\uDC01" : "\uDC01"

const low2: "\uDC01" = "\uDC00"; // error
>low2 : "\uDC01"
>"\uDC00" : "\uDC00"

// Extended Unicode escape syntax should also work
const extHigh: "\u{D800}" = "\u{D800}"; // ok
>extHigh : "\uD800"
>"\u{D800}" : "\uD800"

const extLow: "\u{DC00}" = "\u{DC00}"; // ok
>extLow : "\uDC00"
>"\u{DC00}" : "\uDC00"

const extHighToLow: "\u{D800}" = "\u{DC00}"; // error
>extHighToLow : "\uD800"
>"\u{DC00}" : "\uDC00"

const extLowToHigh: "\u{DC00}" = "\u{D800}"; // error
>extLowToHigh : "\uDC00"
>"\u{D800}" : "\uD800"

// Mixed syntax should also be equivalent
const mixedHigh: "\uD800" = "\u{D800}"; // ok
>mixedHigh : "\uD800"
>"\u{D800}" : "\uD800"

const mixedLow: "\u{DC00}" = "\uDC00"; // ok
>mixedLow : "\uDC00"
>"\uDC00" : "\uDC00"

const mixedError1: "\uD800" = "\u{DC00}"; // error
>mixedError1 : "\uD800"
>"\u{DC00}" : "\uDC00"

const mixedError2: "\u{D800}" = "\uDC00"; // error
>mixedError2 : "\uD800"
>"\uDC00" : "\uDC00"

Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ export const lowHigh = "\ude03\ud83d";

//// [unicodeSurrogatesInStringLiterals.d.ts]
export declare const highLow: "\uD83D\uDE03";
export declare const high: "\uFFFD";
export declare const low: "\uFFFD";
export declare const highHigh: "\uFFFD\uFFFD";
export declare const lowLow: "\uFFFD\uFFFD";
export declare const lowHigh: "\uFFFD\uFFFD";
export declare const high: "\uD83D";
export declare const low: "\uDE03";
export declare const highHigh: "\uD83D\uD83D";
export declare const lowLow: "\uDE03\uDE03";
export declare const lowHigh: "\uDE03\uD83D";
Loading