Skip to content

Commit e5ab750

Browse files
committed
[stdlib] Fix UTF8View.Index less-than operator
1 parent c9278a1 commit e5ab750

File tree

1 file changed

+37
-7
lines changed

1 file changed

+37
-7
lines changed

stdlib/public/core/StringUTF8.swift

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,25 @@ extension String {
227227
&& _coreIndex == core.endIndex
228228
}
229229

230+
/// The number of UTF-8 continuation code units at the front of the buffer,
231+
/// i.e. those remaining before the next unicode scalar value is reached.
232+
/// This simulates calling `index(after: i)` until `i._coreIndex` is
233+
/// incremented, but doesn't need a `_core` reference.
///
/// Returns 0 when the current code unit is not a continuation byte.
234+
internal var _utf8ContinuationBytesUntilNextUnicodeScalar: Int {
235+
// Work on a local copy so the index's own buffer is left untouched.
var buffer = _buffer
236+
var count = 0
237+
238+
while true {
239+
// The current code unit is the low byte of the buffer.
let currentUnit = UTF8.CodeUnit(truncatingBitPattern: buffer)
240+
// Continuation bytes have the bit pattern 0b10xx_xxxx; anything else
// ends the run. This includes the 0xFF filler of an empty buffer
// (`_emptyBuffer` is all ones), so the loop always terminates.
if currentUnit & 0b1100_0000 != 0b1000_0000 {
241+
break
242+
}
243+
count += 1
244+
// Consume the low byte to examine the following code unit.
buffer = Index._nextBuffer(after: buffer)
245+
}
246+
return count
247+
}
248+
230249
/// The value of the buffer when it is empty
231250
internal static var _emptyBuffer: Buffer {
232251
return ~0
@@ -236,7 +255,7 @@ extension String {
236255
internal static var _bufferHiByte: Buffer {
237256
// 0xFF shifted left by (size of Buffer in bytes - 1) * 8 bits, i.e. a mask
// with all bits set in the most significant byte of Buffer.
return 0xFF << numericCast((sizeof(Buffer.self) &- 1) &* 8)
238257
}
239-
258+
240259
/// Consume a byte of the given buffer: shift out the low byte
241260
/// and put FF in the high byte
242261
internal static func _nextBuffer(after thisBuffer: Buffer) -> Buffer {
@@ -277,12 +296,21 @@ extension String {
277296
// FIXME: swift-3-indexing-model: range check i?
278297
let currentUnit = UTF8.CodeUnit(truncatingBitPattern: i._buffer)
279298
let hiNibble = currentUnit >> 4
280-
// Map the high nibble of the current code unit into the
281-
// amount by which to increment the UTF-16 index. Only when
282-
// the high nibble is 1111 do we have a surrogate pair.
299+
300+
// Amounts to increment the UTF-16 index based on the high nibble of a
301+
// UTF-8 code unit. If the high nibble is:
302+
//
303+
// - 0b0000-0b0111: U+0000...U+007F: increment the UTF-16 pointer by 1
304+
// - 0b1000-0b1011: UTF-8 continuation byte, do not increment
305+
// the UTF-16 pointer
306+
// - 0b1100-0b1110: U+0080...U+FFFF: increment the UTF-16 pointer by 1
307+
// - 0b1111: U+10000...U+10FFFF: increment the UTF-16 pointer by 2
283308
let u16Increments = Int(bitPattern:
284309
// 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
285310
0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01)
311+
312+
// Map the high nibble of the current code unit into the
313+
// amount by which to increment the UTF-16 index.
286314
let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3
287315
let nextCoreIndex = i._coreIndex &+ increment
288316
let nextBuffer = Index._nextBuffer(after: i._buffer)
@@ -463,9 +491,11 @@ public func < (
463491
lhs: String.UTF8View.Index,
464492
rhs: String.UTF8View.Index
465493
) -> Bool {
466-
// FIXME: swift-3-indexing-model: tests.
467-
// FIXME: swift-3-indexing-model: this implementation is wrong, it is just a
468-
// temporary HACK.
494+
if lhs._coreIndex == rhs._coreIndex && lhs._buffer != rhs._buffer {
495+
// The index with more continuation bytes remaining before the next
// unicode scalar is the one that comes earlier in the view.
496+
return lhs._utf8ContinuationBytesUntilNextUnicodeScalar >
497+
rhs._utf8ContinuationBytesUntilNextUnicodeScalar
498+
}
469499
return lhs._coreIndex < rhs._coreIndex
470500
}
471501

0 commit comments

Comments
 (0)