Skip to content

Commit 83df814

Browse files
committed
[stdlib] _StringObject.isKnownUTF16 → isForeignUTF8
This fixes a compatibility issue with potential future UTF-8 encoded foreign String forms, as well as simplifying the code a bit — we no longer need to do an availability check on inlinable fast paths. The isForeignUTF8 bit is never set by any past or current stdlib version, but it allows us to introduce UTF-8 encoded foreign forms without breaking inlinable index encoding validation introduced in Swift 5.7.
1 parent 7121600 commit 83df814

8 files changed

+103
-167
lines changed

stdlib/public/core/StringBridge.swift

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,8 +616,7 @@ extension String {
616616
let gutsCountAndFlags = _guts._object._countAndFlags
617617
let countAndFlags = _StringObject.CountAndFlags(
618618
sharedCount: _guts.count,
619-
isASCII: gutsCountAndFlags.isASCII,
620-
isUTF16: false)
619+
isASCII: gutsCountAndFlags.isASCII)
621620
return __SharedStringStorage(
622621
immortal: _guts._object.fastUTF8.baseAddress!,
623622
countAndFlags: countAndFlags)

stdlib/public/core/StringCharacterView.swift

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,14 @@ extension String: BidirectionalCollection {
5555
/// - Returns: The index value immediately after `i`.
5656
public func index(after i: Index) -> Index {
5757
let i = _guts.roundDownToNearestCharacter(_guts.validateScalarIndex(i))
58-
let r = _uncheckedIndex(after: i)
59-
return _guts.internalMarkEncoding(r)
58+
return _uncheckedIndex(after: i)
6059
}
6160

6261
/// A version of `index(after:)` that assumes that the given index:
6362
///
6463
/// - has the right encoding,
6564
/// - is within bounds, and
6665
/// - is scalar aligned.
67-
///
68-
/// It does not mark the encoding of the returned index.
6966
internal func _uncheckedIndex(after i: Index) -> Index {
7067
_internalInvariant(_guts.hasMatchingEncoding(i))
7168
_internalInvariant(i < endIndex)
@@ -77,7 +74,7 @@ extension String: BidirectionalCollection {
7774
let nextIndex = Index(_encodedOffset: nextOffset)._characterAligned
7875
let nextStride = _characterStride(startingAt: nextIndex)
7976
let r = Index(encodedOffset: nextOffset, characterStride: nextStride)
80-
return _guts.internalMarkEncoding(r._characterAligned)
77+
return _guts.markEncoding(r._characterAligned)
8178
}
8279

8380
/// Returns the position immediately before the given index.
@@ -92,17 +89,14 @@ extension String: BidirectionalCollection {
9289
// the `i > startIndex` check needs to come after rounding.
9390
_precondition(i > startIndex, "String index is out of bounds")
9491

95-
let r = _uncheckedIndex(before: i)
96-
return _guts.internalMarkEncoding(r)
92+
return _uncheckedIndex(before: i)
9793
}
9894

9995
/// A version of `index(before:)` that assumes that the given index:
10096
///
10197
/// - has the right encoding,
10298
/// - is within bounds, and
10399
/// - is character aligned.
104-
///
105-
/// It does not mark the encoding of the returned index.
106100
internal func _uncheckedIndex(before i: Index) -> Index {
107101
_internalInvariant(_guts.hasMatchingEncoding(i))
108102
_internalInvariant(i > startIndex && i <= endIndex)
@@ -113,7 +107,7 @@ extension String: BidirectionalCollection {
113107
let priorOffset = i._encodedOffset &- stride
114108

115109
let r = Index(encodedOffset: priorOffset, characterStride: stride)
116-
return r._characterAligned
110+
return _guts.markEncoding(r._characterAligned)
117111
}
118112

119113
/// Returns an index that is the specified distance from the given index.
@@ -158,7 +152,7 @@ extension String: BidirectionalCollection {
158152
i = _uncheckedIndex(before: i)
159153
}
160154
}
161-
return _guts.internalMarkEncoding(i)
155+
return i
162156
}
163157

164158
/// Returns an index that is the specified distance from the given index,
@@ -238,7 +232,7 @@ extension String: BidirectionalCollection {
238232
}
239233
guard limit > start || i >= limit else { return nil }
240234
}
241-
return _guts.internalMarkEncoding(i)
235+
return i
242236
}
243237

244238
/// Returns the distance between two indices.

stdlib/public/core/StringGuts.swift

Lines changed: 27 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -311,55 +311,36 @@ func _isSwiftStdlib_5_7() -> Bool {
311311

312312
// Encoding
313313
extension _StringGuts {
314-
/// Returns whether this string is known to use UTF-16 code units.
314+
/// Returns whether this string has a UTF-8 storage representation.
315315
///
316-
/// This always returns a value corresponding to the string's actual encoding
317-
/// on stdlib versions >=5.7.
316+
/// This always returns a value corresponding to the string's actual encoding.
317+
@_alwaysEmitIntoClient
318+
@inline(__always)
319+
internal var isUTF8: Bool { _object.isUTF8 }
320+
321+
/// Returns whether this string has a UTF-16 storage representation.
318322
///
319-
/// Standard Library versions <=5.6 did not set the corresponding flag, so
320-
/// this property always returns false.
323+
/// This always returns a value corresponding to the string's actual encoding.
321324
@_alwaysEmitIntoClient
322325
@inline(__always)
323-
internal var isKnownUTF16: Bool { _object.isKnownUTF16 }
326+
internal var isUTF16: Bool { _object.isUTF16 }
324327

325328
@_alwaysEmitIntoClient // Swift 5.7
326329
internal func markEncoding(_ i: String.Index) -> String.Index {
327-
// In this inlinable function, we cannot assume that all foreign strings are
328-
// UTF-16 encoded, as this code may run on a future stdlib that may have
329-
// introduced other foreign forms.
330-
if #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *) { // SwiftStdlib 5.7
331-
// With a >=5.7 stdlib, we can rely on `isKnownUTF16` to contain the truth.
332-
return isKnownUTF16 ? i._knownUTF16 : i._knownUTF8
333-
}
334-
// We know that in stdlibs 5.0..<5.7, all foreign strings were UTF-16,
335-
// so we can use `isForeign` to determine the encoding.
336-
return isForeign ? i._knownUTF16 : i._knownUTF8
337-
}
338-
339-
@inline(__always)
340-
internal func internalMarkEncoding(_ i: String.Index) -> String.Index {
341-
// This code is behind a resiliance boundary, so it always runs on a >=5.7
342-
// stdlib. Note though that it doesn't match the 5.7+ case in the inlinable
343-
// version above!
344-
//
345-
// We know that in this version of the stdlib, foreign strings happen to
346-
// always be UTF-16 encoded (like they were between 5.0 and 5.6), and
347-
// looking at `isForeign` instead of `isKnownUTF16` may allow the stdlib's
348-
// internal code to be better optimized -- so let's do that.
349-
isForeign ? i._knownUTF16 : i._knownUTF8
330+
isUTF8 ? i._knownUTF8 : i._knownUTF16
350331
}
351332

352333
/// Returns true if the encoding of the given index isn't known to be in
353334
/// conflict with this string's encoding.
354335
///
355-
/// If the index or the string was created by code that was built on stdlibs
356-
/// below 5.7, then this check may incorrectly return true on a mismatching
357-
/// index, but it is guaranteed to never incorrectly return false. If all
358-
/// loaded binaries were built in 5.7+, then this method is guaranteed to
359-
/// always return the correct value.
360-
@_alwaysEmitIntoClient
336+
/// If the index was created by code that was built on a stdlib below 5.7,
337+
/// then this check may incorrectly return true on a mismatching index, but it
338+
/// is guaranteed to never incorrectly return false. If all loaded binaries
339+
/// were built in 5.7+, then this method is guaranteed to always return the
340+
/// correct value.
341+
@_alwaysEmitIntoClient @inline(__always)
361342
internal func hasMatchingEncoding(_ i: String.Index) -> Bool {
362-
(isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8)
343+
isUTF8 ? i._canBeUTF8 : i._canBeUTF16
363344
}
364345

365346
/// Return an index whose encoding can be assumed to match that of `self`.
@@ -371,22 +352,20 @@ extension _StringGuts {
371352
@_alwaysEmitIntoClient
372353
@inline(__always)
373354
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
374-
if _fastPath(!isForeign && i._canBeUTF8) { return i }
355+
if _fastPath(hasMatchingEncoding(i)) { return i }
375356
return _slowEnsureMatchingEncoding(i)
376357
}
377358

378359
@_alwaysEmitIntoClient
379360
@inline(never)
380361
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
381-
_internalInvariant(isForeign || !i._canBeUTF8)
382-
if isForeign {
383-
// Opportunistically detect attempts to use an UTF-8 index on a UTF-16
384-
// string. Strings don't usually get converted to UTF-16 storage, so it
385-
// seems okay to trap in this case -- the index most likely comes from an
386-
// unrelated string. (Trapping here may still turn out to affect binary
387-
// compatibility with broken code in existing binaries running with new
388-
// stdlibs. If so, we can replace this with the same transcoding hack as
389-
// in the UTF-16->8 case below.)
362+
guard isUTF8 else {
363+
// Attempt to use a UTF-8 index on a UTF-16 string. Strings don't usually
364+
// get converted to UTF-16 storage, so it seems okay to trap in this case
365+
// -- the index most likely comes from an unrelated string. (Trapping here
366+
// may still turn out to affect binary compatibility with broken code in
367+
// existing binaries running with new stdlibs. If so, we can replace this
368+
// with the same transcoding hack as in the UTF-16->8 case below.)
390369
//
391370
// Note that this trap is not guaranteed to trigger when the process
392371
// includes client binaries compiled with a previous Swift release.
@@ -397,13 +376,9 @@ extension _StringGuts {
397376
//
398377
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
399378
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
400-
//
401-
_precondition(!isKnownUTF16 || i._canBeUTF16,
402-
"Invalid string index")
403-
return i
379+
_preconditionFailure("Invalid string index")
404380
}
405-
// If we get here, then we know for sure that this is an attempt to use an
406-
// UTF-16 index on a UTF-8 string.
381+
// Attempt to use a UTF-16 index on a UTF-8 string.
407382
//
408383
// This can happen if `self` was originally verbatim-bridged, and someone
409384
// mistakenly attempts to keep using an old index after a mutation. This is

stdlib/public/core/StringGutsRangeReplaceable.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ extension _StringGuts {
466466
_internalInvariant(
467467
subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex)
468468

469-
if _slowPath(isKnownUTF16) {
469+
if _slowPath(isUTF16) {
470470
// UTF-16 (i.e., foreign) string. The mutation will convert this to the
471471
// native UTF-8 encoding, so we need to do some extra work to preserve our
472472
// bounds.
@@ -479,7 +479,7 @@ extension _StringGuts {
479479
from: subrange.lowerBound, to: subrange.upperBound)
480480

481481
let newUTF8Subrange = body(&self)
482-
_internalInvariant(!isKnownUTF16)
482+
_internalInvariant(isUTF8)
483483

484484
let newUTF8Count =
485485
oldUTF8Count + newUTF8Subrange.count - oldUTF8SubrangeCount

stdlib/public/core/StringIndex.swift

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -364,14 +364,14 @@ extension String.Index {
364364
// this way: position zero is the same no matter what encoding is used for
365365
// the rest of string.)
366366
//
367-
// These two bits (along with the isKnownUTF16 flag in StringObject) allow newer
368-
// versions of the Standard Library to more reliably catch runtime errors where
369-
// client code is applying an index from a UTF-16 string to a UTF-8 one, or vice
370-
// versa. This typically happens when indices from a UTF-16 Cocoa string that
371-
// was verbatim bridged into Swift are accidentally applied to a mutated version
372-
// of the same string. (The mutation turns it into a UTF-8 native string, where
373-
// the same numerical offsets might correspond to wildly different logical
374-
// positions.)
367+
// These two bits (along with the isForeignUTF8 flag in StringObject) allow
368+
// newer versions of the Standard Library to more reliably catch runtime errors
369+
// where client code is applying an index from a UTF-16 string to a UTF-8 one,
370+
// or vice versa. This typically happens when indices from a UTF-16 Cocoa string
371+
// that was verbatim bridged into Swift are accidentally applied to a mutated
372+
// version of the same string. (The mutation turns it into a UTF-8 native
373+
// string, where the same numerical offsets might correspond to wildly different
374+
// logical positions.)
375375
//
376376
// Such code has always been broken, as the old indices are documented to be no
377377
// longer valid after the mutation; however, in previous releases such cases

0 commit comments

Comments
 (0)