Skip to content

Commit 4799994

Browse files
Refactor UTF16.decode for iterator nil guarantee
1 parent 12ec07f commit 4799994

File tree

1 file changed

+24
-62
lines changed

1 file changed

+24
-62
lines changed

stdlib/public/core/Unicode.swift

Lines changed: 24 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -440,14 +440,7 @@ public struct UTF16 : UnicodeCodec {
440440
public init() {}
441441

442442
/// A lookahead buffer for one UTF-16 code unit.
443-
internal var _decodeLookahead: UInt32 = 0
444-
445-
/// Flags with layout: `0b0000_00xy`.
446-
///
447-
/// `y` is the EOF flag.
448-
///
449-
/// `x` is set when `_decodeLookahead` contains a code unit.
450-
internal var _lookaheadFlags: UInt8 = 0
443+
internal var _decodeLookahead: UInt32?
451444

452445
/// Starts or continues decoding a UTF-16 sequence.
453446
///
@@ -493,76 +486,45 @@ public struct UTF16 : UnicodeCodec {
493486
public mutating func decode<
494487
I : IteratorProtocol where I.Element == CodeUnit
495488
>(_ input: inout I) -> UnicodeDecodingResult {
496-
if _lookaheadFlags & 0b01 != 0 {
497-
return .emptyInput
498-
}
499-
500489
// Note: maximal subpart of ill-formed sequence for UTF-16 can only have
501490
// length 1. Length 0 does not make sense. Neither does length 2 -- in
502491
// that case the sequence is valid.
503492

504-
var unit0: UInt32
505-
if _fastPath(_lookaheadFlags & 0b10 == 0) {
506-
if let first = input.next() {
507-
unit0 = UInt32(first)
508-
} else {
509-
// Set EOF flag.
510-
_lookaheadFlags |= 0b01
511-
return .emptyInput
512-
}
513-
} else {
514-
// Fetch code unit from the lookahead buffer and note this fact in flags.
515-
unit0 = _decodeLookahead
516-
_lookaheadFlags &= 0b01
493+
let unit0: UInt32
494+
if _fastPath(_decodeLookahead == nil) {
495+
guard let next = input.next() else { return .emptyInput }
496+
unit0 = UInt32(next)
497+
} else { // Consume lookahead first.
498+
unit0 = _decodeLookahead!
499+
_decodeLookahead = nil
517500
}
518501

519502
// A well-formed pair of surrogates looks like this:
520-
// [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
503+
// high-surrogate low-surrogate
504+
// [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
521505

506+
// Common case first, non-surrogate -- just a sequence of 1 code unit.
522507
if _fastPath((unit0 >> 11) != 0b1101_1) {
523-
// Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
524-
// decoding is trivial.
525-
return .scalarValue(UnicodeScalar(unit0))
508+
return .scalarValue(UnicodeScalar(_unchecked: unit0))
526509
}
527510

528-
if _slowPath((unit0 >> 10) == 0b1101_11) {
529-
// `unit0` is a low-surrogate. We have an ill-formed sequence.
530-
return .error
531-
}
532-
533-
// At this point we know that `unit0` is a high-surrogate.
511+
// Ensure `unit0` is a high-surrogate.
512+
guard _fastPath((unit0 >> 10) == 0b1101_10) else { return .error }
534513

535-
var unit1: UInt32
536-
if let second = input.next() {
537-
unit1 = UInt32(second)
538-
} else {
539-
// EOF reached. Set EOF flag.
540-
_lookaheadFlags |= 0b01
514+
// `unit0` is a high-surrogate, so a second code unit must follow; running out of input here is an error.
515+
guard let next = input.next() else { return .error }
516+
let unit1 = UInt32(next)
541517

542-
// We have seen a high-surrogate and EOF, so we have an ill-formed
543-
// sequence.
518+
// `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
519+
guard _fastPath((unit1 >> 10) == 0b1101_11) else {
520+
// Invalid sequence: discard `unit0` and store `unit1` for the next call.
521+
_decodeLookahead = unit1
544522
return .error
545523
}
546524

547-
if _fastPath((unit1 >> 10) == 0b1101_11) {
548-
// `unit1` is a low-surrogate. We have a well-formed surrogate pair.
549-
550-
let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
551-
return .scalarValue(UnicodeScalar(result))
552-
}
553-
554-
// Otherwise, we have an ill-formed sequence. These are the possible
555-
// cases:
556-
//
557-
// * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
558-
//
559-
// * `unit1` is not a surrogate. We have an ill-formed sequence:
560-
// high-surrogate followed by a non-surrogate.
561-
562-
// Save the second code unit in the lookahead buffer.
563-
_decodeLookahead = unit1
564-
_lookaheadFlags |= 0b10
565-
return .error
525+
// We have a well-formed surrogate pair, decode it.
526+
let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
527+
return .scalarValue(UnicodeScalar(_unchecked: result))
566528
}
567529

568530
/// Try to decode one Unicode scalar, and return the actual number of code

0 commit comments

Comments
 (0)