Merge pull request #3287 from PatrickPijnappel/utf-refactor

gribozavr · web-flow · commit bcb5a363d165 · 2016-07-08T14:07:15.000-07:00
[stdlib] Significant UTF8/16 decode speed-ups for iterator nil-guarantee
diff --git a/stdlib/public/core/Unicode.swift b/stdlib/public/core/Unicode.swift
@@ -101,15 +101,15 @@ public protocol UnicodeCodec {
   ///     print(scalars)
   ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
   ///
-  /// - Parameter next: An iterator of code units to be decoded. `next` must be
+  /// - Parameter input: An iterator of code units to be decoded. `input` must be
   ///   the same iterator instance in repeated calls to this method. Do not
   ///   advance the iterator or any copies of the iterator outside this
   ///   method.
   /// - Returns: A `UnicodeDecodingResult` instance, representing the next
   ///   Unicode scalar, an indication of an error, or an indication that the
   ///   UTF sequence has been fully decoded.
   mutating func decode<I : IteratorProtocol>(
-    _ next: inout I
+    _ input: inout I
   ) -> UnicodeDecodingResult where I.Element == CodeUnit
 
   /// Encodes a Unicode scalar as a series of code units by calling the given
@@ -162,10 +162,6 @@ public struct UTF8 : UnicodeCodec {
   /// The number of bits in `_decodeBuffer` that are currently filled.
   internal var _bitsInBuffer: UInt8 = 0
 
-  /// Whether we have exhausted the iterator.  Note that this doesn't mean
-  /// we are done decoding, as there might still be bytes left in the buffer.
-  internal var _didExhaustIterator: Bool = false
-
   /// Starts or continues decoding a UTF-8 sequence.
   ///
   /// To decode a code unit sequence completely, call this method repeatedly
@@ -200,53 +196,49 @@ public struct UTF8 : UnicodeCodec {
   ///     print(scalars)
   ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
   ///
-  /// - Parameter next: An iterator of code units to be decoded. `next` must be
+  /// - Parameter input: An iterator of code units to be decoded. `input` must be
   ///   the same iterator instance in repeated calls to this method. Do not
   ///   advance the iterator or any copies of the iterator outside this
   ///   method.
   /// - Returns: A `UnicodeDecodingResult` instance, representing the next
   ///   Unicode scalar, an indication of an error, or an indication that the
   ///   UTF sequence has been fully decoded.
   public mutating func decode<I : IteratorProtocol>(
-    _ next: inout I
+    _ input: inout I
   ) -> UnicodeDecodingResult where I.Element == CodeUnit {
 
-    refillBuffer: if !_didExhaustIterator {
-      // Bufferless ASCII fastpath.
-      if _fastPath(_bitsInBuffer == 0) {
-        if let codeUnit = next.next() {
-          if codeUnit & 0x80 == 0 {
-            return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
-          }
-          // Non-ASCII, proceed to buffering mode.
-          _decodeBuffer = UInt32(codeUnit)
-          _bitsInBuffer = 8
-        } else {
-          _didExhaustIterator = true
-          return .emptyInput
-        }
-      } else if (_decodeBuffer & 0x80 == 0) {
-        // ASCII in buffer.  We don't refill the buffer so we can return
-        // to bufferless mode once we've exhausted it.
-        break refillBuffer
+    // Bufferless ASCII fastpath.
+    if _fastPath(_bitsInBuffer == 0) {
+      guard let codeUnit = input.next() else { return .emptyInput }
+      // ASCII, return immediately.
+      if codeUnit & 0x80 == 0 {
+        return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
       }
-      // Buffering mode.
-      // Fill buffer back to 4 bytes (or as many as are left in the iterator).
-      _sanityCheck(_bitsInBuffer < 32)
-      repeat {
-        if let codeUnit = next.next() {
-          // We use & 0x1f to make the compiler omit a bounds check branch.
-          _decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
-          _bitsInBuffer = _bitsInBuffer &+ 8
-        } else {
-          _didExhaustIterator = true
-          if _bitsInBuffer == 0 { return .emptyInput }
-          break // We still have some bytes left in our buffer.
-        }
-      } while _bitsInBuffer < 32
-    } else if _bitsInBuffer == 0 {
-      return .emptyInput
+      // Non-ASCII, proceed to buffering mode.
+      _decodeBuffer = UInt32(codeUnit)
+      _bitsInBuffer = 8
+    } else if (_decodeBuffer & 0x80 == 0) {
+      // ASCII in buffer.  We don't refill the buffer so we can return
+      // to bufferless mode once we've exhausted it.
+      let codeUnit = _decodeBuffer & 0xff
+      _decodeBuffer >>= 8
+      _bitsInBuffer = _bitsInBuffer &- 8
+      return .scalarValue(UnicodeScalar(_unchecked: codeUnit))
     }
+    // Buffering mode.
+    // Fill buffer back to 4 bytes (or as many as are left in the iterator).
+    _sanityCheck(_bitsInBuffer < 32)
+    repeat {
+      if let codeUnit = input.next() {
+        // We know _bitsInBuffer < 32 so we use `& 0x1f` (31) to make the
+        // compiler omit a bounds check branch for the bitshift.
+        _decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
+        _bitsInBuffer = _bitsInBuffer &+ 8
+      } else {
+        if _bitsInBuffer == 0 { return .emptyInput }
+        break // We still have some bytes left in our buffer.
+      }
+    } while _bitsInBuffer < 32
 
     // Decode one unicode scalar.
     // Note our empty bytes are always 0x00, which is required for this call.
@@ -257,16 +249,13 @@ public struct UTF8 : UnicodeCodec {
     _sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer)
     // Swift doesn't allow shifts greater than or equal to the type width.
     // _decodeBuffer >>= UInt32(bitsConsumed) // >>= 32 crashes.
-    // Mask with 0x3f to let the compiler omit the '>= 64' bounds check.
+    // Mask with 0x3f (63) to let the compiler omit the '>= 64' bounds check.
     _decodeBuffer = UInt32(truncatingBitPattern:
       UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x3f))
     _bitsInBuffer = _bitsInBuffer &- bitsConsumed
 
-    if _fastPath(result != nil) {
-      return .scalarValue(UnicodeScalar(_unchecked: result!))
-    } else {
-      return .error // Ill-formed UTF-8 code unit sequence.
-    }
+    guard _fastPath(result != nil) else { return .error }
+    return .scalarValue(UnicodeScalar(_unchecked: result!))
   }
 
   /// Attempts to decode a single UTF-8 code unit sequence starting at the LSB
@@ -451,14 +440,7 @@ public struct UTF16 : UnicodeCodec {
   public init() {}
 
   /// A lookahead buffer for one UTF-16 code unit.
-  internal var _decodeLookahead: UInt32 = 0
-
-  /// Flags with layout: `0b0000_00xy`.
-  ///
-  /// `y` is the EOF flag.
-  ///
-  /// `x` is set when `_decodeLookahead` contains a code unit.
-  internal var _lookaheadFlags: UInt8 = 0
+  internal var _decodeLookahead: UInt16?
 
   /// Starts or continues decoding a UTF-16 sequence.
   ///
@@ -494,7 +476,7 @@ public struct UTF16 : UnicodeCodec {
   ///     print(scalars)
   ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
   ///
-  /// - Parameter next: An iterator of code units to be decoded. `next` must be
+  /// - Parameter input: An iterator of code units to be decoded. `input` must be
   ///   the same iterator instance in repeated calls to this method. Do not
   ///   advance the iterator or any copies of the iterator outside this
   ///   method.
@@ -504,76 +486,44 @@ public struct UTF16 : UnicodeCodec {
   public mutating func decode<I : IteratorProtocol>(
     _ input: inout I
   ) -> UnicodeDecodingResult where I.Element == CodeUnit {
-    if _lookaheadFlags & 0b01 != 0 {
-      return .emptyInput
-    }
-
     // Note: maximal subpart of ill-formed sequence for UTF-16 can only have
     // length 1.  Length 0 does not make sense.  Neither does length 2 -- in
     // that case the sequence is valid.
 
-    var unit0: UInt32
-    if _fastPath(_lookaheadFlags & 0b10 == 0) {
-      if let first = input.next() {
-        unit0 = UInt32(first)
-      } else {
-        // Set EOF flag.
-        _lookaheadFlags |= 0b01
-        return .emptyInput
-      }
-    } else {
-      // Fetch code unit from the lookahead buffer and note this fact in flags.
-      unit0 = _decodeLookahead
-      _lookaheadFlags &= 0b01
+    let unit0: UInt16
+    if _fastPath(_decodeLookahead == nil) {
+      guard let next = input.next() else { return .emptyInput }
+      unit0 = next
+    } else { // Consume lookahead first.
+      unit0 = _decodeLookahead!
+      _decodeLookahead = nil
     }
 
     // A well-formed pair of surrogates looks like this:
-    // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
+    //     high-surrogate        low-surrogate
+    // [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
 
+    // Common case first, non-surrogate -- just a sequence of 1 code unit.
     if _fastPath((unit0 >> 11) != 0b1101_1) {
-      // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
-      // decoding is trivial.
-      return .scalarValue(UnicodeScalar(unit0))
-    }
-
-    if _slowPath((unit0 >> 10) == 0b1101_11) {
-      // `unit0` is a low-surrogate.  We have an ill-formed sequence.
-      return .error
+      return .scalarValue(UnicodeScalar(_unchecked: UInt32(unit0)))
     }
 
-    // At this point we know that `unit0` is a high-surrogate.
+    // Ensure `unit0` is a high-surrogate.
+    guard _fastPath((unit0 >> 10) == 0b1101_10) else { return .error }
 
-    var unit1: UInt32
-    if let second = input.next() {
-      unit1 = UInt32(second)
-    } else {
-      // EOF reached.  Set EOF flag.
-      _lookaheadFlags |= 0b01
+    // We already have a high-surrogate, so there should be a next code unit.
+    guard let unit1 = input.next() else { return .error }
 
-      // We have seen a high-surrogate and EOF, so we have an ill-formed
-      // sequence.
+    // `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
+    guard _fastPath((unit1 >> 10) == 0b1101_11) else {
+      // Invalid sequence, discard `unit0` and store `unit1` for the next call.
+      _decodeLookahead = unit1
       return .error
     }
 
-    if _fastPath((unit1 >> 10) == 0b1101_11) {
-      // `unit1` is a low-surrogate.  We have a well-formed surrogate pair.
-
-      let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
-      return .scalarValue(UnicodeScalar(result))
-    }
-
-    // Otherwise, we have an ill-formed sequence.  These are the possible
-    // cases:
-    //
-    // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
-    //
-    // * `unit1` is not a surrogate.  We have an ill-formed sequence:
-    //   high-surrogate followed by a non-surrogate.
-
-    // Save the second code unit in the lookahead buffer.
-    _decodeLookahead = unit1
-    _lookaheadFlags |= 0b10
-    return .error
+    // We have a well-formed surrogate pair, decode it.
+    let result = 0x10000 + ((UInt32(unit0 & 0x03ff) << 10) | UInt32(unit1 & 0x03ff))
+    return .scalarValue(UnicodeScalar(_unchecked: result))
   }
 
   /// Try to decode one Unicode scalar, and return the actual number of code
@@ -672,7 +622,7 @@ public struct UTF32 : UnicodeCodec {
   ///     print(scalars)
   ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
   ///
-  /// - Parameter next: An iterator of code units to be decoded. `next` must be
+  /// - Parameter input: An iterator of code units to be decoded. `input` must be
   ///   the same iterator instance in repeated calls to this method. Do not
   ///   advance the iterator or any copies of the iterator outside this
   ///   method.
@@ -689,11 +639,11 @@ public struct UTF32 : UnicodeCodec {
     _ input: inout I
   ) -> UnicodeDecodingResult where I.Element == CodeUnit {
     guard let x = input.next() else { return .emptyInput }
-    if _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) {
-      return .scalarValue(UnicodeScalar(x))
-    } else {
-      return .error
-    }
+    // Check code unit is valid: not surrogate-reserved and within range.
+    guard _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff)
+      else { return .error }
+    // x is a valid scalar.
+    return .scalarValue(UnicodeScalar(_unchecked: x))
   }
 
   /// Encodes a Unicode scalar as a UTF-32 code unit by calling the given
diff --git a/validation-test/stdlib/Unicode.swift.gyb b/validation-test/stdlib/Unicode.swift.gyb
@@ -88,40 +88,20 @@ UTF16APIs.test("trailSurrogate/trap/U+FFFF") {
   _ = UTF16.trailSurrogate(us)
 }
 
-class EOFCountingIterator<T> : IteratorProtocol {
-  var array: [T]
-  var index: Int = 0
-  var numTimesReturnedEOF: Int = 0
-
-  init(_ array: [T]) {
-    self.array = array
-  }
-
-  func next() -> T? {
-    if index == array.count {
-      numTimesReturnedEOF += 1
-      return .none
-    }
-    index += 1
-    return array[index - 1]
-  }
-}
-
 func checkDecodeUTF<Codec : UnicodeCodec>(
     _ codec: Codec.Type, _ expectedHead: [UInt32],
     _ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
 ) -> AssertionResult {
   do {
     var decoded = [UInt32]()
     let output: (UInt32) -> Void = { decoded.append($0) }
-    let iterator = EOFCountingIterator(utfStr)
+    let iterator = utfStr.makeIterator()
     transcode(
       iterator,
       from: codec,
       to: UTF32.self,
       stoppingOnError: true,
       sendingOutputTo: output)
-    expectGE(1, iterator.numTimesReturnedEOF)
     if expectedHead != decoded {
       return assertionFailure()
           .withDescription("\n")
@@ -136,14 +116,13 @@ func checkDecodeUTF<Codec : UnicodeCodec>(
 
     var decoded = [UInt32]()
     let output: (UInt32) -> Void = { decoded.append($0) }
-    let iterator = EOFCountingIterator(utfStr)
+    let iterator = utfStr.makeIterator()
     transcode(
       iterator,
       from: codec,
       to: UTF32.self,
       stoppingOnError: false,
       sendingOutputTo: output)
-    expectEqual(1, iterator.numTimesReturnedEOF)
     if expected != decoded {
       return assertionFailure()
           .withDescription("\n")
@@ -182,15 +161,14 @@ func checkEncodeUTF8(_ expected: [UInt8],
                      _ scalars: [UInt32]) -> AssertionResult {
   var encoded = [UInt8]()
   let output: (UInt8) -> Void = { encoded.append($0) }
-  let iterator = EOFCountingIterator(scalars)
+  let iterator = scalars.makeIterator()
   let hadError = transcode(
     iterator,
     from: UTF32.self,
     to: UTF8.self,
     stoppingOnError: true,
     sendingOutputTo: output)
   expectFalse(hadError)
-  expectGE(1, iterator.numTimesReturnedEOF)
   if expected != encoded {
     return assertionFailure()
         .withDescription("\n")