[stdlib] Add new SPI for grapheme breaking (outside String)

lorentey · lorentey · commit 699a3f0ee509 · 2023-01-04T23:58:33.000-08:00
`Unicode._CharacterRecognizer` is a newly exported opaque type that
exposes the stdlib’s extended grapheme cluster breaking facility,
independent of `String`.

This essentially makes the underlying simple state machine public,
without exposing any of the (unstable) Unicode details.

The ability to perform grapheme breaking over, say, the scalars stored
in multiple `String` values can be extremely useful while building
custom text processing algorithms and data structures.

Ideally this would eventually become API, but before proposing this
to Swift Evolution, I’d like to prove the shape of the type in actual
use (and we’ll also need to find better names for its operations).
diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift
@@ -436,6 +436,70 @@ internal struct _GraphemeBreakingState {
   var shouldBreakRI = false
 }
 
+extension Unicode {
+  /// An incremental recognizer of `Character` (that is, extended grapheme
+  /// cluster) boundaries over an arbitrary series of Unicode scalars.
+  ///
+  /// Create a recognizer by handing it the first scalar of the series,
+  /// then pass every following scalar, in order, to
+  /// `hasCharacterBoundary(before:)`. Each call reports whether the scalar
+  /// it was given begins a new `Character`.
+  ///
+  /// The boundaries reported by this type are guaranteed to agree with how
+  /// `String` divides its contents into `Character` values.
+  @available(SwiftStdlib 5.8, *)
+  public // SPI(Foundation) FIXME: We need API for this
+  struct _CharacterRecognizer {
+    internal var _previous: Unicode.Scalar
+    internal var _state: _GraphemeBreakingState
+
+    /// Tries to decide, from the two scalars alone, whether a `Character`
+    /// break lies between `scalar1` and `scalar2`.
+    ///
+    /// Returns `nil` when that cannot be determined without knowing the scalars
+    /// that precede `scalar1`. This makes it a fast (but incomplete) check to
+    /// run before spinning up a full state machine session.
+    @_effects(releasenone)
+    public static func quickBreak(
+      between scalar1: Unicode.Scalar,
+      and scalar2: Unicode.Scalar
+    ) -> Bool? {
+      // CR followed by LF is reported as "no break" right away.
+      let isCRLF = scalar1.value == 0xD && scalar2.value == 0xA
+      guard !isCRLF else { return false }
+      // Only a positive answer from the pairwise check is conclusive.
+      return _hasGraphemeBreakBetween(scalar1, scalar2) ? true : nil
+    }
+
+    /// Creates a recognizer whose series begins with `first`. The scalar is
+    /// assumed to start a new extended grapheme cluster.
+    public init(first: Unicode.Scalar) {
+      _previous = first
+      _state = _GraphemeBreakingState()
+    }
+
+    /// Consumes the next scalar of the series and reports whether it begins
+    /// a new `Character`.
+    ///
+    /// No information is carried across `Character` boundaries: when this
+    /// method returns `true`, `self` afterwards is equivalent to
+    /// `_CharacterRecognizer(first: next)`.
+    @_effects(releasenone)
+    public mutating func hasCharacterBoundary(
+      before next: Unicode.Scalar
+    ) -> Bool {
+      // `next` is the left-hand scalar of the following query either way.
+      defer { _previous = next }
+      guard _state.shouldBreak(between: _previous, and: next) else {
+        return false
+      }
+      // A boundary was found; start the next cluster from a clean state.
+      _state = _GraphemeBreakingState()
+      return true
+    }
+  }
+}
+
 extension _StringGuts {
   // Returns the stride of the grapheme cluster starting at offset `index`,
   // assuming it is on a grapheme cluster boundary.
diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift
@@ -0,0 +1,76 @@
+// RUN: %empty-directory(%t)
+// RUN: %target-run-stdlib-swift %S/Inputs/
+
+// REQUIRES: executable_test
+// REQUIRES: objc_interop
+// REQUIRES: optimized_stdlib
+
+import Swift
+import StdlibUnittest
+import StdlibUnicodeUnittest
+
+var suite = TestSuite("CharacterRecognizer") // Tests are added to it below.
+defer { runAllTests() } // Deferred so it runs after the tests are added.
+
+if #available(SwiftStdlib 5.8, *) {
+  // Replays every entry of `graphemeBreakTests` through the recognizer and
+  // checks that its scalars are grouped into the expected pieces.
+  suite.test("Unicode test data") {
+    for testCase in graphemeBreakTests {
+      let scalars = testCase.string.unicodeScalars
+      guard let head = scalars.first else { continue }
+      var recognizer = Unicode._CharacterRecognizer(first: head)
+      // Each inner array holds the scalars of one recognized `Character`.
+      var clusters: [[Unicode.Scalar]] = [[head]]
+      for scalar in scalars.dropFirst() {
+        if recognizer.hasCharacterBoundary(before: scalar) {
+          clusters.append([scalar])
+        } else {
+          clusters[clusters.count - 1].append(scalar)
+        }
+      }
+      expectEqual(clusters, testCase.pieces,
+        "string: \(String(reflecting: testCase.string))")
+    }
+  }
+}
+
+if #available(SwiftStdlib 5.8, *) {
+  suite.test("Consistency with Swift String's behavior") {
+    let sampleString = #"""
+    The powerful programming language that is also easy to learn.
+    손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
+    🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
+    some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d
+    Unicode is such fun!
+    U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵
+    U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚
+    U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\#
+    n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\#
+    ḭ̸̦̺̺͉̳͎́͑\#
+    c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\#
+    ö̶̱̠̱̤̙͚͖̳̜̰̹̖̣̻͎͉̞̫̬̯͕̝͔̝̟̘͔̙̪̭̲́̆̂͑̌͂̉̀̓́̏̎̋͗͛͆̌̽͌̄̎̚͝͝͝͝ͅ\#
+    d̶̨̨̡̡͙̟͉̱̗̝͙͍̮͍̘̮͔͑\#
+    e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ
+    """#
+
+    // `String` itself is the reference: the start index of each `Character`.
+    let expectedBreaks = Array(sampleString.indices)
+    let scalars = sampleString.unicodeScalars
+    let start = scalars.startIndex
+    var recognizer = Unicode._CharacterRecognizer(first: scalars[start])
+    // The first scalar always counts as a boundary; every later scalar is
+    // fed to the recognizer, keeping the indices it reports as boundaries.
+    var actualBreaks = [start]
+    for index in scalars.indices.dropFirst() {
+      if recognizer.hasCharacterBoundary(before: scalars[index]) {
+        actualBreaks.append(index)
+      }
+    }
+    expectEqual(actualBreaks, expectedBreaks,
+      """
+      actualBreaks: \(actualBreaks.map { $0._description })
+      expectedBreaks: \(expectedBreaks.map { $0._description })
+      """)
+  }
+}