@@ -101,15 +101,15 @@ public protocol UnicodeCodec {
101
101
/// print(scalars)
102
102
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
103
103
///
104
- /// - Parameter next : An iterator of code units to be decoded. `next ` must be
104
+ /// - Parameter input: An iterator of code units to be decoded. `input` must be
105
105
/// the same iterator instance in repeated calls to this method. Do not
106
106
/// advance the iterator or any copies of the iterator outside this
107
107
/// method.
108
108
/// - Returns: A `UnicodeDecodingResult` instance, representing the next
109
109
/// Unicode scalar, an indication of an error, or an indication that the
110
110
/// UTF sequence has been fully decoded.
111
111
mutating func decode< I : IteratorProtocol > (
112
- _ next : inout I
112
+ _ input : inout I
113
113
) -> UnicodeDecodingResult where I. Element == CodeUnit
114
114
115
115
/// Encodes a Unicode scalar as a series of code units by calling the given
@@ -162,10 +162,6 @@ public struct UTF8 : UnicodeCodec {
162
162
/// The number of bits in `_decodeBuffer` that are currently filled.
163
163
internal var _bitsInBuffer : UInt8 = 0
164
164
165
- /// Whether we have exhausted the iterator. Note that this doesn't mean
166
- /// we are done decoding, as there might still be bytes left in the buffer.
167
- internal var _didExhaustIterator : Bool = false
168
-
169
165
/// Starts or continues decoding a UTF-8 sequence.
170
166
///
171
167
/// To decode a code unit sequence completely, call this method repeatedly
@@ -200,53 +196,49 @@ public struct UTF8 : UnicodeCodec {
200
196
/// print(scalars)
201
197
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
202
198
///
203
- /// - Parameter next : An iterator of code units to be decoded. `next ` must be
199
+ /// - Parameter input: An iterator of code units to be decoded. `input` must be
204
200
/// the same iterator instance in repeated calls to this method. Do not
205
201
/// advance the iterator or any copies of the iterator outside this
206
202
/// method.
207
203
/// - Returns: A `UnicodeDecodingResult` instance, representing the next
208
204
/// Unicode scalar, an indication of an error, or an indication that the
209
205
/// UTF sequence has been fully decoded.
210
206
public mutating func decode< I : IteratorProtocol > (
211
- _ next : inout I
207
+ _ input : inout I
212
208
) -> UnicodeDecodingResult where I. Element == CodeUnit {
213
209
214
- refillBuffer: if !_didExhaustIterator {
215
- // Bufferless ASCII fastpath.
216
- if _fastPath ( _bitsInBuffer == 0 ) {
217
- if let codeUnit = next. next ( ) {
218
- if codeUnit & 0x80 == 0 {
219
- return . scalarValue( UnicodeScalar ( _unchecked: UInt32 ( codeUnit) ) )
220
- }
221
- // Non-ASCII, proceed to buffering mode.
222
- _decodeBuffer = UInt32 ( codeUnit)
223
- _bitsInBuffer = 8
224
- } else {
225
- _didExhaustIterator = true
226
- return . emptyInput
227
- }
228
- } else if ( _decodeBuffer & 0x80 == 0 ) {
229
- // ASCII in buffer. We don't refill the buffer so we can return
230
- // to bufferless mode once we've exhausted it.
231
- break refillBuffer
210
+ // Bufferless ASCII fastpath.
211
+ if _fastPath ( _bitsInBuffer == 0 ) {
212
+ guard let codeUnit = input. next ( ) else { return . emptyInput }
213
+ // ASCII, return immediately.
214
+ if codeUnit & 0x80 == 0 {
215
+ return . scalarValue( UnicodeScalar ( _unchecked: UInt32 ( codeUnit) ) )
232
216
}
233
- // Buffering mode.
234
- // Fill buffer back to 4 bytes (or as many as are left in the iterator).
235
- _sanityCheck ( _bitsInBuffer < 32 )
236
- repeat {
237
- if let codeUnit = next. next ( ) {
238
- // We use & 0x1f to make the compiler omit a bounds check branch.
239
- _decodeBuffer |= ( UInt32 ( codeUnit) << UInt32 ( _bitsInBuffer & 0x1f ) )
240
- _bitsInBuffer = _bitsInBuffer &+ 8
241
- } else {
242
- _didExhaustIterator = true
243
- if _bitsInBuffer == 0 { return . emptyInput }
244
- break // We still have some bytes left in our buffer.
245
- }
246
- } while _bitsInBuffer < 32
247
- } else if _bitsInBuffer == 0 {
248
- return . emptyInput
217
+ // Non-ASCII, proceed to buffering mode.
218
+ _decodeBuffer = UInt32 ( codeUnit)
219
+ _bitsInBuffer = 8
220
+ } else if ( _decodeBuffer & 0x80 == 0 ) {
221
+ // ASCII in buffer. We don't refill the buffer so we can return
222
+ // to bufferless mode once we've exhausted it.
223
+ let codeUnit = _decodeBuffer & 0xff
224
+ _decodeBuffer >>= 8
225
+ _bitsInBuffer = _bitsInBuffer &- 8
226
+ return . scalarValue( UnicodeScalar ( _unchecked: codeUnit) )
249
227
}
228
+ // Buffering mode.
229
+ // Fill buffer back to 4 bytes (or as many as are left in the iterator).
230
+ _sanityCheck ( _bitsInBuffer < 32 )
231
+ repeat {
232
+ if let codeUnit = input. next ( ) {
233
+ // We know _bitsInBuffer < 32 so we use `& 0x1f` (31) to make the
234
+ // compiler omit a bounds check branch for the bitshift.
235
+ _decodeBuffer |= ( UInt32 ( codeUnit) << UInt32 ( _bitsInBuffer & 0x1f ) )
236
+ _bitsInBuffer = _bitsInBuffer &+ 8
237
+ } else {
238
+ if _bitsInBuffer == 0 { return . emptyInput }
239
+ break // We still have some bytes left in our buffer.
240
+ }
241
+ } while _bitsInBuffer < 32
250
242
251
243
// Decode one unicode scalar.
252
244
// Note our empty bytes are always 0x00, which is required for this call.
@@ -257,16 +249,13 @@ public struct UTF8 : UnicodeCodec {
257
249
_sanityCheck ( 1 ... 4 ~= length && bitsConsumed <= _bitsInBuffer)
258
250
// Swift doesn't allow shifts greater than or equal to the type width.
259
251
// _decodeBuffer >>= UInt32(bitsConsumed) // >>= 32 crashes.
260
- // Mask with 0x3f to let the compiler omit the '>= 64' bounds check.
252
+ // Mask with 0x3f (63) to let the compiler omit the '>= 64' bounds check.
261
253
_decodeBuffer = UInt32 ( truncatingBitPattern:
262
254
UInt64 ( _decodeBuffer) >> ( UInt64 ( bitsConsumed) & 0x3f ) )
263
255
_bitsInBuffer = _bitsInBuffer &- bitsConsumed
264
256
265
- if _fastPath ( result != nil ) {
266
- return . scalarValue( UnicodeScalar ( _unchecked: result!) )
267
- } else {
268
- return . error // Ill-formed UTF-8 code unit sequence.
269
- }
257
+ guard _fastPath ( result != nil ) else { return . error }
258
+ return . scalarValue( UnicodeScalar ( _unchecked: result!) )
270
259
}
271
260
272
261
/// Attempts to decode a single UTF-8 code unit sequence starting at the LSB
@@ -451,14 +440,7 @@ public struct UTF16 : UnicodeCodec {
451
440
public init ( ) { }
452
441
453
442
/// A lookahead buffer for one UTF-16 code unit.
454
- internal var _decodeLookahead : UInt32 = 0
455
-
456
- /// Flags with layout: `0b0000_00xy`.
457
- ///
458
- /// `y` is the EOF flag.
459
- ///
460
- /// `x` is set when `_decodeLookahead` contains a code unit.
461
- internal var _lookaheadFlags : UInt8 = 0
443
+ internal var _decodeLookahead : UInt16 ?
462
444
463
445
/// Starts or continues decoding a UTF-16 sequence.
464
446
///
@@ -494,7 +476,7 @@ public struct UTF16 : UnicodeCodec {
494
476
/// print(scalars)
495
477
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
496
478
///
497
- /// - Parameter next : An iterator of code units to be decoded. `next ` must be
479
+ /// - Parameter input: An iterator of code units to be decoded. `input` must be
498
480
/// the same iterator instance in repeated calls to this method. Do not
499
481
/// advance the iterator or any copies of the iterator outside this
500
482
/// method.
@@ -504,76 +486,44 @@ public struct UTF16 : UnicodeCodec {
504
486
public mutating func decode< I : IteratorProtocol > (
505
487
_ input: inout I
506
488
) -> UnicodeDecodingResult where I. Element == CodeUnit {
507
- if _lookaheadFlags & 0b01 != 0 {
508
- return . emptyInput
509
- }
510
-
511
489
// Note: maximal subpart of ill-formed sequence for UTF-16 can only have
512
490
// length 1. Length 0 does not make sense. Neither does length 2 -- in
513
491
// that case the sequence is valid.
514
492
515
- var unit0 : UInt32
516
- if _fastPath ( _lookaheadFlags & 0b10 == 0 ) {
517
- if let first = input. next ( ) {
518
- unit0 = UInt32 ( first)
519
- } else {
520
- // Set EOF flag.
521
- _lookaheadFlags |= 0b01
522
- return . emptyInput
523
- }
524
- } else {
525
- // Fetch code unit from the lookahead buffer and note this fact in flags.
526
- unit0 = _decodeLookahead
527
- _lookaheadFlags &= 0b01
493
+ let unit0 : UInt16
494
+ if _fastPath ( _decodeLookahead == nil ) {
495
+ guard let next = input. next ( ) else { return . emptyInput }
496
+ unit0 = next
497
+ } else { // Consume lookahead first.
498
+ unit0 = _decodeLookahead!
499
+ _decodeLookahead = nil
528
500
}
529
501
530
502
// A well-formed pair of surrogates looks like this:
531
- // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
503
+ //    high-surrogate         low-surrogate
504
+ // [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
532
505
506
+ // Common case first, non-surrogate -- just a sequence of 1 code unit.
533
507
if _fastPath ( ( unit0 >> 11 ) != 0b1101_1 ) {
534
- // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
535
- // decoding is trivial.
536
- return . scalarValue( UnicodeScalar ( unit0) )
537
- }
538
-
539
- if _slowPath ( ( unit0 >> 10 ) == 0b1101_11 ) {
540
- // `unit0` is a low-surrogate. We have an ill-formed sequence.
541
- return . error
508
+ return . scalarValue( UnicodeScalar ( _unchecked: UInt32 ( unit0) ) )
542
509
}
543
510
544
- // At this point we know that `unit0` is a high-surrogate.
511
+ // Ensure `unit0` is a high-surrogate.
512
+ guard _fastPath ( ( unit0 >> 10 ) == 0b1101_10 ) else { return . error }
545
513
546
- var unit1 : UInt32
547
- if let second = input. next ( ) {
548
- unit1 = UInt32 ( second)
549
- } else {
550
- // EOF reached. Set EOF flag.
551
- _lookaheadFlags |= 0b01
514
+ // We already have a high-surrogate, so there should be a next code unit.
515
+ guard let unit1 = input. next ( ) else { return . error }
552
516
553
- // We have seen a high-surrogate and EOF, so we have an ill-formed
554
- // sequence.
517
+ // `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
518
+ guard _fastPath ( ( unit1 >> 10 ) == 0b1101_11 ) else {
519
+ // Invalid sequence, discard `unit0` and store `unit1` for the next call.
520
+ _decodeLookahead = unit1
555
521
return . error
556
522
}
557
523
558
- if _fastPath ( ( unit1 >> 10 ) == 0b1101_11 ) {
559
- // `unit1` is a low-surrogate. We have a well-formed surrogate pair.
560
-
561
- let result = 0x10000 + ( ( ( unit0 & 0x03ff ) << 10 ) | ( unit1 & 0x03ff ) )
562
- return . scalarValue( UnicodeScalar ( result) )
563
- }
564
-
565
- // Otherwise, we have an ill-formed sequence. These are the possible
566
- // cases:
567
- //
568
- // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
569
- //
570
- // * `unit1` is not a surrogate. We have an ill-formed sequence:
571
- // high-surrogate followed by a non-surrogate.
572
-
573
- // Save the second code unit in the lookahead buffer.
574
- _decodeLookahead = unit1
575
- _lookaheadFlags |= 0b10
576
- return . error
524
+ // We have a well-formed surrogate pair, decode it.
525
+ let result = 0x10000 + ( ( UInt32 ( unit0 & 0x03ff ) << 10 ) | UInt32 ( unit1 & 0x03ff ) )
526
+ return . scalarValue( UnicodeScalar ( _unchecked: result) )
577
527
}
578
528
579
529
/// Try to decode one Unicode scalar, and return the actual number of code
@@ -672,7 +622,7 @@ public struct UTF32 : UnicodeCodec {
672
622
/// print(scalars)
673
623
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
674
624
///
675
- /// - Parameter next : An iterator of code units to be decoded. `next ` must be
625
+ /// - Parameter input: An iterator of code units to be decoded. `input` must be
676
626
/// the same iterator instance in repeated calls to this method. Do not
677
627
/// advance the iterator or any copies of the iterator outside this
678
628
/// method.
@@ -689,11 +639,11 @@ public struct UTF32 : UnicodeCodec {
689
639
_ input: inout I
690
640
) -> UnicodeDecodingResult where I. Element == CodeUnit {
691
641
guard let x = input. next ( ) else { return . emptyInput }
692
- if _fastPath ( ( x >> 11 ) != 0b1101_1 && x <= 0x10ffff ) {
693
- return . scalarValue ( UnicodeScalar ( x ) )
694
- } else {
695
- return . error
696
- }
642
+ // Check code unit is valid: not surrogate-reserved and within range.
643
+ guard _fastPath ( ( x >> 11 ) != 0b1101_1 && x <= 0x10ffff )
644
+ else { return . error }
645
+ // x is a valid scalar.
646
+ return . scalarValue ( UnicodeScalar ( _unchecked : x ) )
697
647
}
698
648
699
649
/// Encodes a Unicode scalar as a UTF-32 code unit by calling the given
0 commit comments