Skip to content

Commit 3d74633

Browse files
authored
Merge pull request swiftlang#9024 from apple/stateful-unicode-decoding
2 parents 9899526 + 1753e66 commit 3d74633

File tree

1 file changed

+73 -74 lines changed

test/Prototypes/UnicodeDecoders.swift

Lines changed: 73 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ public struct _UIntBuffer<
4343
var _storage: Storage
4444
@_versioned
4545
var _bitCount: UInt8
46+
47+
@inline(__always)
48+
public init(containing e: Element) {
49+
_storage = Storage(extendingOrTruncating: e)
50+
_bitCount = UInt8(extendingOrTruncating: Element.bitWidth)
51+
}
4652
}
4753

4854
extension _UIntBuffer : Sequence {
@@ -222,7 +228,7 @@ public enum Unicode {
222228

223229
extension Unicode {
224230
public enum ParseResult<T> {
225-
case valid(T, length: Int)
231+
case valid(T)
226232
case emptyInput
227233
case invalid(length: Int)
228234

@@ -245,9 +251,9 @@ public protocol UnicodeDecoder {
245251

246252
mutating func parseOne<I : IteratorProtocol>(
247253
_ input: inout I
248-
) -> Unicode.ParseResult<UInt32> where I.Element == CodeUnit
254+
) -> Unicode.ParseResult<Buffer> where I.Element == CodeUnit
249255

250-
static func scalar(bufferStorage: UInt32, length: Int) -> UnicodeScalar
256+
static func decodeOne(_ content: Buffer) -> UnicodeScalar
251257
}
252258

253259
extension UnicodeDecoder {
@@ -264,8 +270,8 @@ extension UnicodeDecoder {
264270
var d = Self()
265271
while true {
266272
switch d.parseOne(&input) {
267-
case let .valid(bufferStorage, length: length):
268-
output(scalar(bufferStorage: bufferStorage, length: length))
273+
case let .valid(scalarContent):
274+
output(decodeOne(scalarContent))
269275
case .invalid:
270276
if !makeRepairs { return 1 }
271277
errors += 1
@@ -289,30 +295,30 @@ public protocol UnicodeEncoding {
289295

290296

291297
public protocol _UTF8Decoder : UnicodeDecoder {
292-
func _validateBuffer() -> (valid: Bool, length: UInt8)
298+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8)
293299
var buffer: Buffer { get set }
294300
}
295301

296-
extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
302+
extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
297303
public mutating func parseOne<I : IteratorProtocol>(
298304
_ input: inout I
299-
) -> Unicode.ParseResult<UInt32> where I.Element == Unicode.UTF8.CodeUnit {
305+
) -> Unicode.ParseResult<Buffer> where I.Element == Unicode.UTF8.CodeUnit {
300306

301307
// Bufferless ASCII fastpath.
302308
if _fastPath(buffer.isEmpty) {
303309
guard let codeUnit = input.next() else { return .emptyInput }
304310
// ASCII, return immediately.
305311
if codeUnit & 0x80 == 0 {
306-
return .valid(UInt32(codeUnit), length: 1)
312+
return .valid(Buffer(containing: codeUnit))
307313
}
308314
// Non-ASCII, proceed to buffering mode.
309315
buffer.append(codeUnit)
310316
} else if buffer._storage & 0x80 == 0 {
311317
// ASCII in buffer. We don't refill the buffer so we can return
312318
// to bufferless mode once we've exhausted it.
313-
let codeUnit = buffer._storage & 0xff
319+
let codeUnit = UInt8(extendingOrTruncating: buffer._storage)
314320
buffer.remove(at: buffer.startIndex)
315-
return .valid(codeUnit, length: 1)
321+
return .valid(Buffer(containing: codeUnit))
316322
}
317323
// Buffering mode.
318324
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
@@ -327,24 +333,22 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
327333
} while buffer._bitCount < 32
328334

329335
// Find one unicode scalar.
330-
// Note our empty bytes are always 0x00, which is required for this call.
331-
let (valid, length) = _validateBuffer()
332-
336+
let (isValid, scalarBitCount) = _parseNonASCII()
337+
_sanityCheck(scalarBitCount % 8 == 0 && 1...4 ~= scalarBitCount / 8)
338+
_sanityCheck(scalarBitCount <= buffer._bitCount)
339+
333340
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
334-
let bitsConsumed = 8 &* length
335-
_sanityCheck(1...4 ~= length && bitsConsumed <= buffer._bitCount)
336-
let savedBuffer = buffer._storage
341+
var encodedScalar = buffer
342+
encodedScalar._bitCount = scalarBitCount
337343

338344
buffer._storage = UInt32(
339345
// widen to 64 bits so that we can empty the buffer in the 4-byte case
340-
extendingOrTruncating: UInt64(buffer._storage) &>> bitsConsumed)
346+
extendingOrTruncating: UInt64(buffer._storage) &>> scalarBitCount)
341347

342-
buffer._bitCount = buffer._bitCount &- bitsConsumed
348+
buffer._bitCount = buffer._bitCount &- scalarBitCount
343349

344-
guard _fastPath(valid) else {
345-
return .invalid(length: Int(length))
346-
}
347-
return .valid(savedBuffer, length: Int(length))
350+
if _fastPath(isValid) { return .valid(encodedScalar) }
351+
return .invalid(length: Int(scalarBitCount &>> 3))
348352
}
349353
}
350354

@@ -364,41 +368,37 @@ extension Unicode.UTF8 : UnicodeEncoding {
364368
extension UTF8.ReverseDecoder : _UTF8Decoder {
365369
public typealias CodeUnit = UInt8
366370

367-
public static func scalar(bufferStorage: UInt32, length: Int) -> UnicodeScalar {
368-
switch length {
369-
case 1:
370-
return UnicodeScalar(_unchecked: bufferStorage & 0xff)
371-
case 2:
372-
var value = bufferStorage & 0b0______________________11_1111
373-
value |= bufferStorage &>> 2 & 0b0______________0111__1100_0000
371+
public static func decodeOne(_ encodedScalar: Buffer) -> UnicodeScalar {
372+
let bits = encodedScalar._storage
373+
switch encodedScalar._bitCount {
374+
case 8: return UnicodeScalar(_unchecked: bits)
375+
case 16:
376+
var value = bits & 0b0______________________11_1111
377+
value |= bits &>> 2 & 0b0______________0111__1100_0000
374378
return UnicodeScalar(_unchecked: value)
375-
case 3:
376-
var value = bufferStorage & 0b0______________________11_1111
377-
value |= bufferStorage &>> 2 & 0b0______________1111__1100_0000
378-
value |= bufferStorage &>> 4 & 0b0_________1111_0000__0000_0000
379+
case 24:
380+
var value = bits & 0b0______________________11_1111
381+
value |= bits &>> 2 & 0b0______________1111__1100_0000
382+
value |= bits &>> 4 & 0b0_________1111_0000__0000_0000
379383
return UnicodeScalar(_unchecked: value)
380384
default:
381-
_sanityCheck(length == 4)
382-
var value = bufferStorage & 0b0______________________11_1111
383-
value |= bufferStorage &>> 2 & 0b0______________1111__1100_0000
384-
value |= bufferStorage &>> 4 & 0b0_____11__1111_0000__0000_0000
385-
value |= bufferStorage &>> 6 & 0b0_1_1100__0000_0000__0000_0000
385+
_sanityCheck(encodedScalar._bitCount == 32)
386+
var value = bits & 0b0______________________11_1111
387+
value |= bits &>> 2 & 0b0______________1111__1100_0000
388+
value |= bits &>> 4 & 0b0_____11__1111_0000__0000_0000
389+
value |= bits &>> 6 & 0b0_1_1100__0000_0000__0000_0000
386390
return UnicodeScalar(_unchecked: value)
387391
}
388392
}
389393

390394
public // @testable
391-
func _validateBuffer() -> (valid: Bool, length: UInt8) {
392-
// FIXME: is this check eliminated when inlined into parseOne?
393-
if buffer._storage & 0x80 == 0 {
394-
return (true, 1)
395-
}
396-
395+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
396+
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
397397
if buffer._storage & 0b0__1110_0000__1100_0000
398398
== 0b0__1100_0000__1000_0000 {
399399
// 2-byte sequence. Top 4 bits of decoded result must be nonzero
400400
let top4Bits = buffer._storage & 0b0__0001_1110__0000_0000
401-
if _fastPath(top4Bits != 0) { return (true, 2) }
401+
if _fastPath(top4Bits != 0) { return (true, 2*8) }
402402
}
403403
else if buffer._storage & 0b0__1111_0000__1100_0000__1100_0000
404404
== 0b0__1110_0000__1000_0000__1000_0000 {
@@ -407,7 +407,7 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
407407
let top5Bits = buffer._storage & 0b0__1111__0010_0000__0000_0000
408408
if _fastPath(
409409
top5Bits != 0 && top5Bits != 0b0__1101__0010_0000__0000_0000) {
410-
return (true, 3)
410+
return (true, 3*8)
411411
}
412412
}
413413
else if buffer._storage & 0b0__1111_1000__1100_0000__1100_0000__1100_0000
@@ -418,9 +418,9 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
418418
if _fastPath(
419419
top5bits != 0
420420
&& top5bits <= 0b0__0100__0000_0000__0000_0000__0000_0000
421-
) { return (true, 4) }
421+
) { return (true, 4*8) }
422422
}
423-
return (false, _invalidLength())
423+
return (false, _invalidLength() &* 8)
424424
}
425425

426426
/// Returns the length of the invalid sequence that ends with the LSB of
@@ -459,24 +459,22 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
459459
public typealias CodeUnit = UInt8
460460

461461
public // @testable
462-
func _validateBuffer() -> (valid: Bool, length: UInt8) {
463-
if buffer._storage & 0x80 == 0 { // 1-byte sequence (ASCII), buffer: [ ... ... ... CU0 ].
464-
return (true, 1)
465-
}
466-
462+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
463+
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
464+
467465
if buffer._storage & 0b0__1100_0000__1110_0000
468466
== 0b0__1000_0000__1100_0000 {
469467
// 2-byte sequence. At least one of the top 4 bits of the decoded result
470468
// must be nonzero.
471-
if _fastPath(buffer._storage & 0b0_0001_1110 != 0) { return (true, 2) }
469+
if _fastPath(buffer._storage & 0b0_0001_1110 != 0) { return (true, 2*8) }
472470
}
473471
else if buffer._storage & 0b0__1100_0000__1100_0000__1111_0000
474472
== 0b0__1000_0000__1000_0000__1110_0000 {
475473
// 3-byte sequence. The top 5 bits of the decoded result must be nonzero
476474
// and not a surrogate
477475
let top5Bits = buffer._storage & 0b0___0010_0000__0000_1111
478476
if _fastPath(top5Bits != 0 && top5Bits != 0b0___0010_0000__0000_1101) {
479-
return (true, 3)
477+
return (true, 3*8)
480478
}
481479
}
482480
else if buffer._storage & 0b0__1100_0000__1100_0000__1100_0000__1111_1000
@@ -487,9 +485,9 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
487485
if _fastPath(
488486
top5bits != 0
489487
&& top5bits.byteSwapped <= 0b0__0000_0100__0000_0000
490-
) { return (true, 4) }
488+
) { return (true, 4*8) }
491489
}
492-
return (false, _invalidLength())
490+
return (false, _invalidLength() &* 8)
493491
}
494492

495493
/// Returns the length of the invalid sequence that starts with the LSB of
@@ -517,25 +515,26 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
517515
return 1
518516
}
519517

520-
public static func scalar(bufferStorage: UInt32, length: Int) -> UnicodeScalar {
521-
switch length {
522-
case 1:
523-
return UnicodeScalar(_unchecked: bufferStorage & 0xff)
524-
case 2:
525-
var value = (bufferStorage & 0b0_______________________11_1111__0000_0000) &>> 8
526-
value |= (bufferStorage & 0b0________________________________0001_1111) &<< 6
518+
public static func decodeOne(_ encodedScalar: Buffer) -> UnicodeScalar {
519+
let bits = encodedScalar._storage
520+
switch encodedScalar._bitCount {
521+
case 8:
522+
return UnicodeScalar(_unchecked: bits)
523+
case 16:
524+
var value = (bits & 0b0_______________________11_1111__0000_0000) &>> 8
525+
value |= (bits & 0b0________________________________0001_1111) &<< 6
527526
return UnicodeScalar(_unchecked: value)
528-
case 3:
529-
var value = (bufferStorage & 0b0____________11_1111__0000_0000__0000_0000) &>> 16
530-
value |= (bufferStorage & 0b0_______________________11_1111__0000_0000) &>> 2
531-
value |= (bufferStorage & 0b0________________________________0000_1111) &<< 12
527+
case 24:
528+
var value = (bits & 0b0____________11_1111__0000_0000__0000_0000) &>> 16
529+
value |= (bits & 0b0_______________________11_1111__0000_0000) &>> 2
530+
value |= (bits & 0b0________________________________0000_1111) &<< 12
532531
return UnicodeScalar(_unchecked: value)
533532
default:
534-
_sanityCheck(length == 4)
535-
var value = (bufferStorage & 0b0_11_1111__0000_0000__0000_0000__0000_0000) &>> 24
536-
value |= (bufferStorage & 0b0____________11_1111__0000_0000__0000_0000) &>> 10
537-
value |= (bufferStorage & 0b0_______________________11_1111__0000_0000) &<< 4
538-
value |= (bufferStorage & 0b0________________________________0000_0111) &<< 18
533+
_sanityCheck(encodedScalar.count == 4)
534+
var value = (bits & 0b0_11_1111__0000_0000__0000_0000__0000_0000) &>> 24
535+
value |= (bits & 0b0____________11_1111__0000_0000__0000_0000) &>> 10
536+
value |= (bits & 0b0_______________________11_1111__0000_0000) &<< 4
537+
value |= (bits & 0b0________________________________0000_0111) &<< 18
539538
return UnicodeScalar(_unchecked: value)
540539
}
541540
}

0 commit comments

Comments
 (0)