Merge pull request #3452 from natecook1000/nc-utf8index-nocore

gribozavr · web-flow · commit 2bf19132deca · 2016-07-13T09:11:29.000-07:00
[stdlib] Remove _StringCore from UTF8View.Index
diff --git a/stdlib/public/core/StringUTF16.swift b/stdlib/public/core/StringUTF16.swift
@@ -407,7 +407,7 @@ extension String.UTF16View.Index {
       "Invalid String.UTF8Index for this UTF-16 view")
 
     // Detect positions that have no corresponding index.
-    if !utf8Index._isOnUnicodeScalarBoundary {
+    if !utf8Index._isOnUnicodeScalarBoundary(in: core) {
       return nil
     }
     _offset = utf8Index._coreIndex
diff --git a/stdlib/public/core/StringUTF8.swift b/stdlib/public/core/StringUTF8.swift
@@ -174,10 +174,10 @@ extension String {
 
     init(_ _core: _StringCore) {
       self._core = _core
-      self._endIndex = Index(_core, _core.endIndex, Index._emptyBuffer)
+      self._endIndex = Index(_coreIndex: _core.endIndex, Index._emptyBuffer)
       if _fastPath(_core.count != 0) {
         let (_, buffer) = _core._encodeSomeUTF8(from: 0)
-        self._startIndex = Index(_core, 0, buffer)
+        self._startIndex = Index(_coreIndex: 0, buffer)
       } else {
         self._startIndex = self._endIndex
       }
@@ -208,27 +208,42 @@ extension String {
     public struct Index : Comparable {
       internal typealias Buffer = _StringCore._UTF8Chunk
 
-      init(_ _core: _StringCore, _ _coreIndex: Int,
-           _ _buffer: Buffer) {
-        self._core = _core
+      init(_coreIndex: Int, _ _buffer: Buffer) {
         self._coreIndex = _coreIndex
         self._buffer = _buffer
-        _sanityCheck(_coreIndex >= 0)
-        _sanityCheck(_coreIndex <= _core.count)
       }
 
       /// True iff the index is at the end of its view or if the next
       /// byte begins a new UnicodeScalar.
-      internal var _isOnUnicodeScalarBoundary: Bool {
+      internal func _isOnUnicodeScalarBoundary(in core: _StringCore) -> Bool {
         let buffer = UInt32(truncatingBitPattern: _buffer)
         let (codePoint, _) = UTF8._decodeOne(buffer)
-        return codePoint != nil || _isAtEnd
+        return codePoint != nil || _isEndIndex(of: core)
       }
 
       /// True iff the index is at the end of its view
-      internal var _isAtEnd: Bool {
+      internal func _isEndIndex(of core: _StringCore) -> Bool {
         return _buffer == Index._emptyBuffer
-          && _coreIndex == _core.endIndex
+          && _coreIndex == core.endIndex
+      }
+
+      /// The number of UTF-8 continuation bytes remaining in the buffer before
+      /// the next unicode scalar value is reached. This simulates calling
+      /// `index(after: i)` until `i` is on a unicode scalar boundary, but
+      /// doesn't need a `_core` reference.
+      internal var _utf8ContinuationBytesUntilNextUnicodeScalar: Int {
+        var buffer = _buffer
+        var count = 0
+        
+        while true {
+          let currentUnit = UTF8.CodeUnit(truncatingBitPattern: buffer)
+          if currentUnit & 0b1100_0000 != 0b1000_0000 {
+            break
+          }
+          count += 1
+          buffer = Index._nextBuffer(after: buffer)
+        }
+        return count
       }
 
       /// The value of the buffer when it is empty
@@ -240,20 +255,19 @@ extension String {
       internal static var _bufferHiByte: Buffer {
         return 0xFF << numericCast((sizeof(Buffer.self) &- 1) &* 8)
       }
-
+      
       /// Consume a byte of the given buffer: shift out the low byte
       /// and put FF in the high byte
       internal static func _nextBuffer(after thisBuffer: Buffer) -> Buffer {
         return (thisBuffer >> 8) | _bufferHiByte
       }
 
-      /// The underlying buffer we're presenting as UTF-8
-      internal let _core: _StringCore
       /// The position of `self`, rounded up to the nearest unicode
       /// scalar boundary, in the underlying UTF-16.
       internal let _coreIndex: Int
-      /// If `self` is at the end of its `_core`, has the value `_endBuffer`.
-      /// Otherwise, the low byte contains the value of
+      /// If `self` is at the end of its `_core`, has the value `_emptyBuffer`.
+      /// Otherwise, the low byte contains the value of the UTF-8 code unit
+      /// at this position.
       internal let _buffer: Buffer
     }
 
@@ -282,31 +296,40 @@ extension String {
       // FIXME: swift-3-indexing-model: range check i?
       let currentUnit = UTF8.CodeUnit(truncatingBitPattern: i._buffer)
       let hiNibble = currentUnit >> 4
-      // Map the high nibble of the current code unit into the
-      // amount by which to increment the UTF-16 index.  Only when
-      // the high nibble is 1111 do we have a surrogate pair.
+
+      // Amounts to increment the UTF-16 index based on the high nibble of a
+      // UTF-8 code unit. If the high nibble is:
+      //
+      // - 0b0000-0b0111: U+0000...U+007F: increment the UTF-16 pointer by 1
+      // - 0b1000-0b1011: UTF-8 continuation byte, do not increment 
+      //                  the UTF-16 pointer
+      // - 0b1100-0b1110: U+0080...U+FFFF: increment the UTF-16 pointer by 1
+      // - 0b1111:        U+10000...U+10FFFF: increment the UTF-16 pointer by 2
       let u16Increments = Int(bitPattern:
       // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
          0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01)
+      
+      // Map the high nibble of the current code unit into the
+      // amount by which to increment the UTF-16 index.
       let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3
       let nextCoreIndex = i._coreIndex &+ increment
       let nextBuffer = Index._nextBuffer(after: i._buffer)
 
-      // if the nextBuffer is nonempty, we have all we need
+      // If the nextBuffer is nonempty, we have all we need
       if _fastPath(nextBuffer != Index._emptyBuffer) {
-        return Index(i._core, nextCoreIndex, nextBuffer)
+        return Index(_coreIndex: nextCoreIndex, nextBuffer)
       }
       // If the underlying UTF16 isn't exhausted, fill a new buffer
-      else if _fastPath(nextCoreIndex < i._core.endIndex) {
-        let (_, freshBuffer) = i._core._encodeSomeUTF8(from: nextCoreIndex)
-        return Index(_core, nextCoreIndex, freshBuffer)
+      else if _fastPath(nextCoreIndex < _core.endIndex) {
+        let (_, freshBuffer) = _core._encodeSomeUTF8(from: nextCoreIndex)
+        return Index(_coreIndex: nextCoreIndex, freshBuffer)
       }
       else {
         // Produce the endIndex
         _precondition(
-          nextCoreIndex == i._core.endIndex,
+          nextCoreIndex == _core.endIndex,
           "Can't increment past endIndex of String.UTF8View")
-        return Index(_core, nextCoreIndex, nextBuffer)
+        return Index(_coreIndex: nextCoreIndex, nextBuffer)
       }
     }
 
@@ -468,17 +491,19 @@ public func < (
   lhs: String.UTF8View.Index,
   rhs: String.UTF8View.Index
 ) -> Bool {
-  // FIXME: swift-3-indexing-model: tests.
-  // FIXME: swift-3-indexing-model: this implementation is wrong, it is just a
-  // temporary HACK.
+  if lhs._coreIndex == rhs._coreIndex && lhs._buffer != rhs._buffer {
+    // The index with more continuation bytes remaining is the earlier one.
+    return lhs._utf8ContinuationBytesUntilNextUnicodeScalar >
+      rhs._utf8ContinuationBytesUntilNextUnicodeScalar
+  }
   return lhs._coreIndex < rhs._coreIndex
 }
 
 // Index conversions
 extension String.UTF8View.Index {
   internal init(_ core: _StringCore, _utf16Offset: Int) {
       let (_, buffer) = core._encodeSomeUTF8(from: _utf16Offset)
-      self.init(core, _utf16Offset, buffer)
+      self.init(_coreIndex: _utf16Offset, buffer)
   }
 
   /// Creates an index in the given UTF-8 view that corresponds exactly to the
diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift
@@ -471,7 +471,7 @@ extension String.UnicodeScalarIndex {
       "Invalid String.UTF8Index for this UnicodeScalar view")
 
     // Detect positions that have no corresponding index.
-    if !utf8Index._isOnUnicodeScalarBoundary {
+    if !utf8Index._isOnUnicodeScalarBoundary(in: core) {
       return nil
     }
     self.init(_position: utf8Index._coreIndex)
diff --git a/validation-test/stdlib/StringViews.swift b/validation-test/stdlib/StringViews.swift
@@ -687,6 +687,14 @@ tests.test("UTF8 indexes") {
   }
 }
 
+tests.test("index/Comparable")
+  .forEach(in: [summer, winter]) { str in
+  checkComparable(str.characters.indices, oracle: <=>)
+  checkComparable(str.unicodeScalars.indices, oracle: <=>)
+  checkComparable(str.utf16.indices, oracle: <=>)
+  checkComparable(str.utf8.indices, oracle: <=>)
+}
+
 tests.test("UTF16->String") {
   let s = summer + winter + winter + summer
   let v = s.utf16

Original file line number	Diff line number	Diff line change
`@@ -407,7 +407,7 @@ extension String.UTF16View.Index {`
`407`	`407`	`"Invalid String.UTF8Index for this UTF-16 view")`
`408`	`408`
`409`	`409`	`// Detect positions that have no corresponding index.`
`410`		`- if !utf8Index._isOnUnicodeScalarBoundary {`
	`410`	`+ if !utf8Index._isOnUnicodeScalarBoundary(in: core) {`
`411`	`411`	`return nil`
`412`	`412`	`}`
`413`	`413`	`_offset = utf8Index._coreIndex`
Original file line number	Diff line number	Diff line change
`@@ -471,7 +471,7 @@ extension String.UnicodeScalarIndex {`
`471`	`471`	`"Invalid String.UTF8Index for this UnicodeScalar view")`
`472`	`472`
`473`	`473`	`// Detect positions that have no corresponding index.`
`474`		`- if !utf8Index._isOnUnicodeScalarBoundary {`
	`474`	`+ if !utf8Index._isOnUnicodeScalarBoundary(in: core) {`
`475`	`475`	`return nil`
`476`	`476`	`}`
`477`	`477`	`self.init(_position: utf8Index._coreIndex)`
Original file line number	Diff line number	Diff line change
`@@ -687,6 +687,14 @@ tests.test("UTF8 indexes") {`
`687`	`687`	`}`
`688`	`688`	`}`
`689`	`689`
	`690`	`+tests.test("index/Comparable")`
	`691`	`+ .forEach(in: [summer, winter]) { str in`
	`692`	`+ checkComparable(str.characters.indices, oracle: <=>)`
	`693`	`+ checkComparable(str.unicodeScalars.indices, oracle: <=>)`
	`694`	`+ checkComparable(str.utf16.indices, oracle: <=>)`
	`695`	`+ checkComparable(str.utf8.indices, oracle: <=>)`
	`696`	`+}`
	`697`	`+`
`690`	`698`	`tests.test("UTF16->String") {`
`691`	`699`	`let s = summer + winter + winter + summer`
`692`	`700`	`let v = s.utf16`