Skip to content

Commit bcb5a36

Browse files
authored
Merge pull request #3287 from PatrickPijnappel/utf-refactor
[stdlib] Significant UTF8/16 decode speed-ups for iterator nil-guarantee
2 parents c41d064 + d4470aa commit bcb5a36

File tree

2 files changed

+70
-142
lines changed

2 files changed

+70
-142
lines changed

stdlib/public/core/Unicode.swift

Lines changed: 67 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,15 @@ public protocol UnicodeCodec {
101101
/// print(scalars)
102102
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
103103
///
104-
/// - Parameter next: An iterator of code units to be decoded. `next` must be
104+
/// - Parameter input: An iterator of code units to be decoded. `input` must be
105105
/// the same iterator instance in repeated calls to this method. Do not
106106
/// advance the iterator or any copies of the iterator outside this
107107
/// method.
108108
/// - Returns: A `UnicodeDecodingResult` instance, representing the next
109109
/// Unicode scalar, an indication of an error, or an indication that the
110110
/// UTF sequence has been fully decoded.
111111
mutating func decode<I : IteratorProtocol>(
112-
_ next: inout I
112+
_ input: inout I
113113
) -> UnicodeDecodingResult where I.Element == CodeUnit
114114

115115
/// Encodes a Unicode scalar as a series of code units by calling the given
@@ -162,10 +162,6 @@ public struct UTF8 : UnicodeCodec {
162162
/// The number of bits in `_decodeBuffer` that are currently filled.
163163
internal var _bitsInBuffer: UInt8 = 0
164164

165-
/// Whether we have exhausted the iterator. Note that this doesn't mean
166-
/// we are done decoding, as there might still be bytes left in the buffer.
167-
internal var _didExhaustIterator: Bool = false
168-
169165
/// Starts or continues decoding a UTF-8 sequence.
170166
///
171167
/// To decode a code unit sequence completely, call this method repeatedly
@@ -200,53 +196,49 @@ public struct UTF8 : UnicodeCodec {
200196
/// print(scalars)
201197
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
202198
///
203-
/// - Parameter next: An iterator of code units to be decoded. `next` must be
199+
/// - Parameter input: An iterator of code units to be decoded. `input` must be
204200
/// the same iterator instance in repeated calls to this method. Do not
205201
/// advance the iterator or any copies of the iterator outside this
206202
/// method.
207203
/// - Returns: A `UnicodeDecodingResult` instance, representing the next
208204
/// Unicode scalar, an indication of an error, or an indication that the
209205
/// UTF sequence has been fully decoded.
210206
public mutating func decode<I : IteratorProtocol>(
211-
_ next: inout I
207+
_ input: inout I
212208
) -> UnicodeDecodingResult where I.Element == CodeUnit {
213209

214-
refillBuffer: if !_didExhaustIterator {
215-
// Bufferless ASCII fastpath.
216-
if _fastPath(_bitsInBuffer == 0) {
217-
if let codeUnit = next.next() {
218-
if codeUnit & 0x80 == 0 {
219-
return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
220-
}
221-
// Non-ASCII, proceed to buffering mode.
222-
_decodeBuffer = UInt32(codeUnit)
223-
_bitsInBuffer = 8
224-
} else {
225-
_didExhaustIterator = true
226-
return .emptyInput
227-
}
228-
} else if (_decodeBuffer & 0x80 == 0) {
229-
// ASCII in buffer. We don't refill the buffer so we can return
230-
// to bufferless mode once we've exhausted it.
231-
break refillBuffer
210+
// Bufferless ASCII fastpath.
211+
if _fastPath(_bitsInBuffer == 0) {
212+
guard let codeUnit = input.next() else { return .emptyInput }
213+
// ASCII, return immediately.
214+
if codeUnit & 0x80 == 0 {
215+
return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
232216
}
233-
// Buffering mode.
234-
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
235-
_sanityCheck(_bitsInBuffer < 32)
236-
repeat {
237-
if let codeUnit = next.next() {
238-
// We use & 0x1f to make the compiler omit a bounds check branch.
239-
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
240-
_bitsInBuffer = _bitsInBuffer &+ 8
241-
} else {
242-
_didExhaustIterator = true
243-
if _bitsInBuffer == 0 { return .emptyInput }
244-
break // We still have some bytes left in our buffer.
245-
}
246-
} while _bitsInBuffer < 32
247-
} else if _bitsInBuffer == 0 {
248-
return .emptyInput
217+
// Non-ASCII, proceed to buffering mode.
218+
_decodeBuffer = UInt32(codeUnit)
219+
_bitsInBuffer = 8
220+
} else if (_decodeBuffer & 0x80 == 0) {
221+
// ASCII in buffer. We don't refill the buffer so we can return
222+
// to bufferless mode once we've exhausted it.
223+
let codeUnit = _decodeBuffer & 0xff
224+
_decodeBuffer >>= 8
225+
_bitsInBuffer = _bitsInBuffer &- 8
226+
return .scalarValue(UnicodeScalar(_unchecked: codeUnit))
249227
}
228+
// Buffering mode.
229+
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
230+
_sanityCheck(_bitsInBuffer < 32)
231+
repeat {
232+
if let codeUnit = input.next() {
233+
// We know _bitsInBuffer < 32 so we use `& 0x1f` (31) to make the
234+
// compiler omit a bounds check branch for the bitshift.
235+
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
236+
_bitsInBuffer = _bitsInBuffer &+ 8
237+
} else {
238+
if _bitsInBuffer == 0 { return .emptyInput }
239+
break // We still have some bytes left in our buffer.
240+
}
241+
} while _bitsInBuffer < 32
250242

251243
// Decode one Unicode scalar.
252244
// Note our empty bytes are always 0x00, which is required for this call.
@@ -257,16 +249,13 @@ public struct UTF8 : UnicodeCodec {
257249
_sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer)
258250
// Swift doesn't allow shifts greater than or equal to the type width.
259251
// _decodeBuffer >>= UInt32(bitsConsumed) // >>= 32 crashes.
260-
// Mask with 0x3f to let the compiler omit the '>= 64' bounds check.
252+
// Mask with 0x3f (63) to let the compiler omit the '>= 64' bounds check.
261253
_decodeBuffer = UInt32(truncatingBitPattern:
262254
UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x3f))
263255
_bitsInBuffer = _bitsInBuffer &- bitsConsumed
264256

265-
if _fastPath(result != nil) {
266-
return .scalarValue(UnicodeScalar(_unchecked: result!))
267-
} else {
268-
return .error // Ill-formed UTF-8 code unit sequence.
269-
}
257+
guard _fastPath(result != nil) else { return .error }
258+
return .scalarValue(UnicodeScalar(_unchecked: result!))
270259
}
271260

272261
/// Attempts to decode a single UTF-8 code unit sequence starting at the LSB
@@ -451,14 +440,7 @@ public struct UTF16 : UnicodeCodec {
451440
public init() {}
452441

453442
/// A lookahead buffer for one UTF-16 code unit.
454-
internal var _decodeLookahead: UInt32 = 0
455-
456-
/// Flags with layout: `0b0000_00xy`.
457-
///
458-
/// `y` is the EOF flag.
459-
///
460-
/// `x` is set when `_decodeLookahead` contains a code unit.
461-
internal var _lookaheadFlags: UInt8 = 0
443+
internal var _decodeLookahead: UInt16?
462444

463445
/// Starts or continues decoding a UTF-16 sequence.
464446
///
@@ -494,7 +476,7 @@ public struct UTF16 : UnicodeCodec {
494476
/// print(scalars)
495477
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
496478
///
497-
/// - Parameter next: An iterator of code units to be decoded. `next` must be
479+
/// - Parameter input: An iterator of code units to be decoded. `input` must be
498480
/// the same iterator instance in repeated calls to this method. Do not
499481
/// advance the iterator or any copies of the iterator outside this
500482
/// method.
@@ -504,76 +486,44 @@ public struct UTF16 : UnicodeCodec {
504486
public mutating func decode<I : IteratorProtocol>(
505487
_ input: inout I
506488
) -> UnicodeDecodingResult where I.Element == CodeUnit {
507-
if _lookaheadFlags & 0b01 != 0 {
508-
return .emptyInput
509-
}
510-
511489
// Note: maximal subpart of ill-formed sequence for UTF-16 can only have
512490
// length 1. Length 0 does not make sense. Neither does length 2 -- in
513491
// that case the sequence is valid.
514492

515-
var unit0: UInt32
516-
if _fastPath(_lookaheadFlags & 0b10 == 0) {
517-
if let first = input.next() {
518-
unit0 = UInt32(first)
519-
} else {
520-
// Set EOF flag.
521-
_lookaheadFlags |= 0b01
522-
return .emptyInput
523-
}
524-
} else {
525-
// Fetch code unit from the lookahead buffer and note this fact in flags.
526-
unit0 = _decodeLookahead
527-
_lookaheadFlags &= 0b01
493+
let unit0: UInt16
494+
if _fastPath(_decodeLookahead == nil) {
495+
guard let next = input.next() else { return .emptyInput }
496+
unit0 = next
497+
} else { // Consume lookahead first.
498+
unit0 = _decodeLookahead!
499+
_decodeLookahead = nil
528500
}
529501

530502
// A well-formed pair of surrogates looks like this:
531-
// [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]
503+
// high-surrogate low-surrogate
504+
// [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
532505

506+
// Common case first, non-surrogate -- just a sequence of 1 code unit.
533507
if _fastPath((unit0 >> 11) != 0b1101_1) {
534-
// Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
535-
// decoding is trivial.
536-
return .scalarValue(UnicodeScalar(unit0))
537-
}
538-
539-
if _slowPath((unit0 >> 10) == 0b1101_11) {
540-
// `unit0` is a low-surrogate. We have an ill-formed sequence.
541-
return .error
508+
return .scalarValue(UnicodeScalar(_unchecked: UInt32(unit0)))
542509
}
543510

544-
// At this point we know that `unit0` is a high-surrogate.
511+
// Ensure `unit0` is a high-surrogate.
512+
guard _fastPath((unit0 >> 10) == 0b1101_10) else { return .error }
545513

546-
var unit1: UInt32
547-
if let second = input.next() {
548-
unit1 = UInt32(second)
549-
} else {
550-
// EOF reached. Set EOF flag.
551-
_lookaheadFlags |= 0b01
514+
// We already have a high-surrogate, so there should be a next code unit.
515+
guard let unit1 = input.next() else { return .error }
552516

553-
// We have seen a high-surrogate and EOF, so we have an ill-formed
554-
// sequence.
517+
// `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
518+
guard _fastPath((unit1 >> 10) == 0b1101_11) else {
519+
// Invalid sequence, discard `unit0` and store `unit1` for the next call.
520+
_decodeLookahead = unit1
555521
return .error
556522
}
557523

558-
if _fastPath((unit1 >> 10) == 0b1101_11) {
559-
// `unit1` is a low-surrogate. We have a well-formed surrogate pair.
560-
561-
let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
562-
return .scalarValue(UnicodeScalar(result))
563-
}
564-
565-
// Otherwise, we have an ill-formed sequence. These are the possible
566-
// cases:
567-
//
568-
// * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
569-
//
570-
// * `unit1` is not a surrogate. We have an ill-formed sequence:
571-
// high-surrogate followed by a non-surrogate.
572-
573-
// Save the second code unit in the lookahead buffer.
574-
_decodeLookahead = unit1
575-
_lookaheadFlags |= 0b10
576-
return .error
524+
// We have a well-formed surrogate pair, decode it.
525+
let result = 0x10000 + ((UInt32(unit0 & 0x03ff) << 10) | UInt32(unit1 & 0x03ff))
526+
return .scalarValue(UnicodeScalar(_unchecked: result))
577527
}
578528

579529
/// Try to decode one Unicode scalar, and return the actual number of code
@@ -672,7 +622,7 @@ public struct UTF32 : UnicodeCodec {
672622
/// print(scalars)
673623
/// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
674624
///
675-
/// - Parameter next: An iterator of code units to be decoded. `next` must be
625+
/// - Parameter input: An iterator of code units to be decoded. `input` must be
676626
/// the same iterator instance in repeated calls to this method. Do not
677627
/// advance the iterator or any copies of the iterator outside this
678628
/// method.
@@ -689,11 +639,11 @@ public struct UTF32 : UnicodeCodec {
689639
_ input: inout I
690640
) -> UnicodeDecodingResult where I.Element == CodeUnit {
691641
guard let x = input.next() else { return .emptyInput }
692-
if _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) {
693-
return .scalarValue(UnicodeScalar(x))
694-
} else {
695-
return .error
696-
}
642+
// Check code unit is valid: not surrogate-reserved and within range.
643+
guard _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff)
644+
else { return .error }
645+
// x is a valid scalar.
646+
return .scalarValue(UnicodeScalar(_unchecked: x))
697647
}
698648

699649
/// Encodes a Unicode scalar as a UTF-32 code unit by calling the given

validation-test/stdlib/Unicode.swift.gyb

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -88,40 +88,20 @@ UTF16APIs.test("trailSurrogate/trap/U+FFFF") {
8888
_ = UTF16.trailSurrogate(us)
8989
}
9090

91-
class EOFCountingIterator<T> : IteratorProtocol {
92-
var array: [T]
93-
var index: Int = 0
94-
var numTimesReturnedEOF: Int = 0
95-
96-
init(_ array: [T]) {
97-
self.array = array
98-
}
99-
100-
func next() -> T? {
101-
if index == array.count {
102-
numTimesReturnedEOF += 1
103-
return .none
104-
}
105-
index += 1
106-
return array[index - 1]
107-
}
108-
}
109-
11091
func checkDecodeUTF<Codec : UnicodeCodec>(
11192
_ codec: Codec.Type, _ expectedHead: [UInt32],
11293
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
11394
) -> AssertionResult {
11495
do {
11596
var decoded = [UInt32]()
11697
let output: (UInt32) -> Void = { decoded.append($0) }
117-
let iterator = EOFCountingIterator(utfStr)
98+
let iterator = utfStr.makeIterator()
11899
transcode(
119100
iterator,
120101
from: codec,
121102
to: UTF32.self,
122103
stoppingOnError: true,
123104
sendingOutputTo: output)
124-
expectGE(1, iterator.numTimesReturnedEOF)
125105
if expectedHead != decoded {
126106
return assertionFailure()
127107
.withDescription("\n")
@@ -136,14 +116,13 @@ func checkDecodeUTF<Codec : UnicodeCodec>(
136116

137117
var decoded = [UInt32]()
138118
let output: (UInt32) -> Void = { decoded.append($0) }
139-
let iterator = EOFCountingIterator(utfStr)
119+
let iterator = utfStr.makeIterator()
140120
transcode(
141121
iterator,
142122
from: codec,
143123
to: UTF32.self,
144124
stoppingOnError: false,
145125
sendingOutputTo: output)
146-
expectEqual(1, iterator.numTimesReturnedEOF)
147126
if expected != decoded {
148127
return assertionFailure()
149128
.withDescription("\n")
@@ -182,15 +161,14 @@ func checkEncodeUTF8(_ expected: [UInt8],
182161
_ scalars: [UInt32]) -> AssertionResult {
183162
var encoded = [UInt8]()
184163
let output: (UInt8) -> Void = { encoded.append($0) }
185-
let iterator = EOFCountingIterator(scalars)
164+
let iterator = scalars.makeIterator()
186165
let hadError = transcode(
187166
iterator,
188167
from: UTF32.self,
189168
to: UTF8.self,
190169
stoppingOnError: true,
191170
sendingOutputTo: output)
192171
expectFalse(hadError)
193-
expectGE(1, iterator.numTimesReturnedEOF)
194172
if expected != encoded {
195173
return assertionFailure()
196174
.withDescription("\n")

0 commit comments

Comments
 (0)