@@ -440,14 +440,7 @@ public struct UTF16 : UnicodeCodec {
440
440
public init ( ) { }
441
441
442
442
/// A lookahead buffer for one UTF-16 code unit.
443
- internal var _decodeLookahead : UInt32 = 0
444
-
445
- /// Flags with layout: `0b0000_00xy`.
446
- ///
447
- /// `y` is the EOF flag.
448
- ///
449
- /// `x` is set when `_decodeLookahead` contains a code unit.
450
- internal var _lookaheadFlags : UInt8 = 0
443
+ internal var _decodeLookahead : UInt32 ?
451
444
452
445
/// Starts or continues decoding a UTF-16 sequence.
453
446
///
@@ -493,76 +486,45 @@ public struct UTF16 : UnicodeCodec {
493
486
public mutating func decode<
494
487
I : IteratorProtocol where I. Element == CodeUnit
495
488
> ( _ input: inout I ) -> UnicodeDecodingResult {
496
- if _lookaheadFlags & 0b01 != 0 {
497
- return . emptyInput
498
- }
499
-
500
489
// Note: maximal subpart of ill-formed sequence for UTF-16 can only have
501
490
// length 1. Length 0 does not make sense. Neither does length 2 -- in
502
491
// that case the sequence is valid.
503
492
504
- var unit0 : UInt32
505
- if _fastPath ( _lookaheadFlags & 0b10 == 0 ) {
506
- if let first = input. next ( ) {
507
- unit0 = UInt32 ( first)
508
- } else {
509
- // Set EOF flag.
510
- _lookaheadFlags |= 0b01
511
- return . emptyInput
512
- }
513
- } else {
514
- // Fetch code unit from the lookahead buffer and note this fact in flags.
515
- unit0 = _decodeLookahead
516
- _lookaheadFlags &= 0b01
493
+ let unit0 : UInt32
494
+ if _fastPath ( _decodeLookahead == nil ) {
495
+ guard let next = input. next ( ) else { return . emptyInput }
496
+ unit0 = UInt32 ( next)
497
+ } else { // Consume lookahead first.
498
+ unit0 = _decodeLookahead!
499
+ _decodeLookahead = nil
517
500
}
518
501
519
502
// A well-formed pair of surrogates looks like this:
520
- // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
503
+ // high-surrogate low-surrogate
504
+ // [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
521
505
506
+ // Common case first, non-surrogate -- just a sequence of 1 code unit.
522
507
if _fastPath ( ( unit0 >> 11 ) != 0b1101_1 ) {
523
- // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
524
- // decoding is trivial.
525
- return . scalarValue( UnicodeScalar ( unit0) )
508
+ return . scalarValue( UnicodeScalar ( _unchecked: unit0) )
526
509
}
527
510
528
- if _slowPath ( ( unit0 >> 10 ) == 0b1101_11 ) {
529
- // `unit0` is a low-surrogate. We have an ill-formed sequence.
530
- return . error
531
- }
532
-
533
- // At this point we know that `unit0` is a high-surrogate.
511
+ // Ensure `unit0` is a high-surrogate.
512
+ guard _fastPath ( ( unit0 >> 10 ) == 0b1101_10 ) else { return . error }
534
513
535
- var unit1 : UInt32
536
- if let second = input. next ( ) {
537
- unit1 = UInt32 ( second)
538
- } else {
539
- // EOF reached. Set EOF flag.
540
- _lookaheadFlags |= 0b01
514
+ // We already have a high-surrogate, so there should be a next code unit.
515
+ guard let next = input. next ( ) else { return . error }
516
+ let unit1 = UInt32 ( next)
541
517
542
- // We have seen a high-surrogate and EOF, so we have an ill-formed
543
- // sequence.
518
+ // `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
519
+ guard _fastPath ( ( unit1 >> 10 ) == 0b1101_11 ) else {
520
+ // Invalid sequence, discard `unit0` and store `unit1` for the next call.
521
+ _decodeLookahead = unit1
544
522
return . error
545
523
}
546
524
547
- if _fastPath ( ( unit1 >> 10 ) == 0b1101_11 ) {
548
- // `unit1` is a low-surrogate. We have a well-formed surrogate pair.
549
-
550
- let result = 0x10000 + ( ( ( unit0 & 0x03ff ) << 10 ) | ( unit1 & 0x03ff ) )
551
- return . scalarValue( UnicodeScalar ( result) )
552
- }
553
-
554
- // Otherwise, we have an ill-formed sequence. These are the possible
555
- // cases:
556
- //
557
- // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
558
- //
559
- // * `unit1` is not a surrogate. We have an ill-formed sequence:
560
- // high-surrogate followed by a non-surrogate.
561
-
562
- // Save the second code unit in the lookahead buffer.
563
- _decodeLookahead = unit1
564
- _lookaheadFlags |= 0b10
565
- return . error
525
+ // We have a well-formed surrogate pair, decode it.
526
+ let result = 0x10000 + ( ( ( unit0 & 0x03ff ) << 10 ) | ( unit1 & 0x03ff ) )
527
+ return . scalarValue( UnicodeScalar ( _unchecked: result) )
566
528
}
567
529
568
530
/// Try to decode one Unicode scalar, and return the actual number of code
0 commit comments