Skip to content

Commit bbcec6c

Browse files
Refactor UTF8.decode for iterator nil guarantee
1 parent 4799994 commit bbcec6c

File tree

1 file changed

+35
-46
lines changed

1 file changed

+35
-46
lines changed

stdlib/public/core/Unicode.swift

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,6 @@ public struct UTF8 : UnicodeCodec {
155155
/// The number of bits in `_decodeBuffer` that are currently filled.
156156
internal var _bitsInBuffer: UInt8 = 0
157157

158-
/// Whether we have exhausted the iterator. Note that this doesn't mean
159-
/// we are done decoding, as there might still be bytes left in the buffer.
160-
internal var _didExhaustIterator: Bool = false
161-
162158
/// Starts or continues decoding a UTF-8 sequence.
163159
///
164160
/// To decode a code unit sequence completely, call this method repeatedly
@@ -202,44 +198,40 @@ public struct UTF8 : UnicodeCodec {
202198
/// UTF sequence has been fully decoded.
203199
public mutating func decode<
204200
I : IteratorProtocol where I.Element == CodeUnit
205-
>(_ next: inout I) -> UnicodeDecodingResult {
206-
207-
refillBuffer: if !_didExhaustIterator {
208-
// Bufferless ASCII fastpath.
209-
if _fastPath(_bitsInBuffer == 0) {
210-
if let codeUnit = next.next() {
211-
if codeUnit & 0x80 == 0 {
212-
return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
213-
}
214-
// Non-ASCII, proceed to buffering mode.
215-
_decodeBuffer = UInt32(codeUnit)
216-
_bitsInBuffer = 8
217-
} else {
218-
_didExhaustIterator = true
219-
return .emptyInput
220-
}
221-
} else if (_decodeBuffer & 0x80 == 0) {
222-
// ASCII in buffer. We don't refill the buffer so we can return
223-
// to bufferless mode once we've exhausted it.
224-
break refillBuffer
201+
>(_ input: inout I) -> UnicodeDecodingResult {
202+
203+
// Bufferless ASCII fastpath.
204+
if _fastPath(_bitsInBuffer == 0) {
205+
guard let codeUnit = input.next() else { return .emptyInput }
206+
// ASCII, return immediately.
207+
if codeUnit & 0x80 == 0 {
208+
return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit)))
225209
}
226-
// Buffering mode.
227-
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
228-
_sanityCheck(_bitsInBuffer < 32)
229-
repeat {
230-
if let codeUnit = next.next() {
231-
// We use & 0x1f to make the compiler omit a bounds check branch.
232-
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
233-
_bitsInBuffer = _bitsInBuffer &+ 8
234-
} else {
235-
_didExhaustIterator = true
236-
if _bitsInBuffer == 0 { return .emptyInput }
237-
break // We still have some bytes left in our buffer.
238-
}
239-
} while _bitsInBuffer < 32
240-
} else if _bitsInBuffer == 0 {
241-
return .emptyInput
210+
// Non-ASCII, proceed to buffering mode.
211+
_decodeBuffer = UInt32(codeUnit)
212+
_bitsInBuffer = 8
213+
} else if (_decodeBuffer & 0x80 == 0) {
214+
// ASCII in buffer. We don't refill the buffer so we can return
215+
// to bufferless mode once we've exhausted it.
216+
let codeUnit = _decodeBuffer & 0xff
217+
_decodeBuffer >>= 8
218+
_bitsInBuffer = _bitsInBuffer &- 8
219+
return .scalarValue(UnicodeScalar(_unchecked: codeUnit))
242220
}
221+
// Buffering mode.
222+
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
223+
_sanityCheck(_bitsInBuffer < 32)
224+
repeat {
225+
if let codeUnit = input.next() {
226+
// We know _bitsInBuffer < 32 so we use `& 0x1f` (31) to make the
227+
// compiler omit a bounds check branch for the bitshift.
228+
_decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f))
229+
_bitsInBuffer = _bitsInBuffer &+ 8
230+
} else {
231+
if _bitsInBuffer == 0 { return .emptyInput }
232+
break // We still have some bytes left in our buffer.
233+
}
234+
} while _bitsInBuffer < 32
243235

244236
// Decode one Unicode scalar.
245237
// Note our empty bytes are always 0x00, which is required for this call.
@@ -250,16 +242,13 @@ public struct UTF8 : UnicodeCodec {
250242
_sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer)
251243
// Swift doesn't allow shifts greater than or equal to the type width.
252244
// _decodeBuffer >>= UInt32(bitsConsumed) // >>= 32 crashes.
253-
// Mask with 0x3f to let the compiler omit the '>= 64' bounds check.
245+
// Mask with 0x3f (63) to let the compiler omit the '>= 64' bounds check.
254246
_decodeBuffer = UInt32(truncatingBitPattern:
255247
UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x3f))
256248
_bitsInBuffer = _bitsInBuffer &- bitsConsumed
257249

258-
if _fastPath(result != nil) {
259-
return .scalarValue(UnicodeScalar(_unchecked: result!))
260-
} else {
261-
return .error // Ill-formed UTF-8 code unit sequence.
262-
}
250+
guard _fastPath(result != nil) else { return .error }
251+
return .scalarValue(UnicodeScalar(_unchecked: result!))
263252
}
264253

265254
/// Attempts to decode a single UTF-8 code unit sequence starting at the LSB

0 commit comments

Comments
 (0)