Skip to content

Commit 6e5097a

Browse files
committed
[stdlib] String: Move shouldBreak into _GraphemeBreakingState
This turns _GraphemeBreakingState into a more proper state machine, although it can only recognize breaks in the forward direction. Finding breaks in the backward direction requires arbitrarily long lookback, so that logic currently remains in _StringGuts.
1 parent e562133 commit 6e5097a

File tree

1 file changed

+18
-18
lines changed

1 file changed

+18
-18
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// This source file is part of the Swift.org open source project
44
//
5-
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
5+
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
66
// Licensed under Apache License v2.0 with Runtime Library Exception
77
//
88
// See https://swift.org/LICENSE.txt for license information
@@ -459,7 +459,7 @@ extension _StringGuts {
459459

460460
while true {
461461
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
462-
if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) {
462+
if state.shouldBreak(between: scalar, and: scalar2) {
463463
break
464464
}
465465
index = nextIndex
@@ -505,7 +505,7 @@ extension _StringGuts {
505505
}
506506
}
507507

508-
extension _StringGuts {
508+
extension _GraphemeBreakingState {
509509
// Return true if there is an extended grapheme cluster boundary between two
510510
// scalars, based on state information previously collected about preceding
511511
// scalars.
@@ -517,11 +517,9 @@ extension _StringGuts {
517517
//
518518
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
519519
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
520-
internal func shouldBreak(
520+
internal mutating func shouldBreak(
521521
between scalar1: Unicode.Scalar,
522-
and scalar2: Unicode.Scalar,
523-
at index: Int,
524-
with state: inout _GraphemeBreakingState
522+
and scalar2: Unicode.Scalar
525523
) -> Bool {
526524
// GB3
527525
if scalar1.value == 0xD, scalar2.value == 0xA {
@@ -545,8 +543,8 @@ extension _StringGuts {
545543
var enterIndicSequence = false
546544

547545
defer {
548-
state.isInEmojiSequence = enterEmojiSequence
549-
state.isInIndicSequence = enterIndicSequence
546+
self.isInEmojiSequence = enterEmojiSequence
547+
self.isInIndicSequence = enterIndicSequence
550548
}
551549

552550
switch (x, y) {
@@ -591,14 +589,14 @@ extension _StringGuts {
591589
// continue the grapheme cluster by combining more scalars later. If we're
592590
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
593591
// then that's a signal that it's the start of an emoji sequence.
594-
if state.isInEmojiSequence || x == .extendedPictographic {
592+
if self.isInEmojiSequence || x == .extendedPictographic {
595593
enterEmojiSequence = true
596594
}
597595

598596
// If we're currently in an indic sequence (or if our lhs is a linking
599597
// consonant), then this check and everything underneath ensures that
600598
// we continue being in one and may check if this extend is a Virama.
601-
if state.isInIndicSequence || scalar1._isLinkingConsonant {
599+
if self.isInIndicSequence || scalar1._isLinkingConsonant {
602600
if y == .extend {
603601
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
604602

@@ -611,7 +609,7 @@ extension _StringGuts {
611609
enterIndicSequence = true
612610

613611
if scalar2._isVirama {
614-
state.hasSeenVirama = true
612+
self.hasSeenVirama = true
615613
}
616614
}
617615

@@ -627,32 +625,34 @@ extension _StringGuts {
627625

628626
// GB11
629627
case (.zwj, .extendedPictographic):
630-
return !state.isInEmojiSequence
628+
return !self.isInEmojiSequence
631629

632630
// GB12 & GB13
633631
case (.regionalIndicator, .regionalIndicator):
634632
defer {
635-
state.shouldBreakRI.toggle()
633+
self.shouldBreakRI.toggle()
636634
}
637635

638-
return state.shouldBreakRI
636+
return self.shouldBreakRI
639637

640638
// GB999
641639
default:
642640
// GB9c
643641
if
644-
state.isInIndicSequence,
645-
state.hasSeenVirama,
642+
self.isInIndicSequence,
643+
self.hasSeenVirama,
646644
scalar2._isLinkingConsonant
647645
{
648-
state.hasSeenVirama = false
646+
self.hasSeenVirama = false
649647
return false
650648
}
651649

652650
return true
653651
}
654652
}
653+
}
655654

655+
extension _StringGuts {
656656
// Return true if there is an extended grapheme cluster boundary between two
657657
// scalars, with no previous knowledge about preceding scalars.
658658
//

0 commit comments

Comments
 (0)