@@ -43,6 +43,12 @@ public struct _UIntBuffer<
43
43
var _storage : Storage
44
44
@_versioned
45
45
var _bitCount : UInt8
46
+
47
/// Creates a buffer that holds the single element `e`.
///
/// `_storage` is initialized from `e` via `Storage(extendingOrTruncating:)`,
/// and `_bitCount` is set to `Element.bitWidth`, i.e. the width of one element.
///
/// - Parameter e: The element to store.
+ @inline ( __always)
48
+ public init ( containing e: Element ) {
49
+ _storage = Storage ( extendingOrTruncating: e)
50
// The bit width is narrowed to UInt8 because `_bitCount` is declared as UInt8.
+ _bitCount = UInt8 ( extendingOrTruncating: Element . bitWidth)
51
+ }
46
52
}
47
53
48
54
extension _UIntBuffer : Sequence {
@@ -222,7 +228,7 @@ public enum Unicode {
222
228
223
229
extension Unicode {
224
230
public enum ParseResult < T> {
225
- case valid( T , length : Int )
231
+ case valid( T )
226
232
case emptyInput
227
233
case invalid( length: Int )
228
234
@@ -245,9 +251,9 @@ public protocol UnicodeDecoder {
245
251
246
252
/// Parses code units from `input` and reports the outcome as a
/// `Unicode.ParseResult<Buffer>`.
///
/// The decoding loop in the `UnicodeDecoder` extension of this file hands the
/// payload of a `.valid` result to `decodeOne` and counts an `.invalid` result
/// as an error.
///
/// - Parameter input: An iterator whose elements are this decoder's `CodeUnit`.
mutating func parseOne< I : IteratorProtocol > (
247
253
_ input: inout I
248
- ) -> Unicode . ParseResult < UInt32 > where I. Element == CodeUnit
254
+ ) -> Unicode . ParseResult < Buffer > where I. Element == CodeUnit
249
255
250
- static func scalar ( bufferStorage : UInt32 , length : Int ) -> UnicodeScalar
256
/// Converts `content` into a `UnicodeScalar`.
///
/// In this file it is called with the payload of a `.valid` result returned by
/// `parseOne`.
+ static func decodeOne ( _ content : Buffer ) -> UnicodeScalar
251
257
}
252
258
253
259
extension UnicodeDecoder {
@@ -264,8 +270,8 @@ extension UnicodeDecoder {
264
270
var d = Self ( )
265
271
while true {
266
272
switch d. parseOne ( & input) {
267
- case let . valid( bufferStorage , length : length ) :
268
- output ( scalar ( bufferStorage : bufferStorage , length : length ) )
273
+ case let . valid( scalarContent ) :
274
+ output ( decodeOne ( scalarContent ) )
269
275
case . invalid:
270
276
if !makeRepairs { return 1 }
271
277
errors += 1
@@ -289,30 +295,30 @@ public protocol UnicodeEncoding {
289
295
290
296
291
297
/// A `UnicodeDecoder` refinement that exposes a mutable `buffer` and a
/// `_parseNonASCII()` hook.
///
/// The shared `parseOne` implementation (in the extension constrained to
/// `Buffer == _UIntBuffer<UInt32, UInt8>`) calls `_parseNonASCII()` after
/// filling `buffer`, then removes `bitCount` bits from the low end of `buffer`.
public protocol _UTF8Decoder : UnicodeDecoder {
292
- func _validateBuffer ( ) -> ( valid : Bool , length : UInt8 )
298
// Returns whether the sequence at the low end of `buffer` is valid, and how
// many bits it occupies. The caller sanity-checks that `bitCount` is a
// multiple of 8 covering 1...4 bytes and does not exceed `buffer._bitCount`.
// The conformers visible in this file sanity-check that the low byte of the
// buffer has its high bit set (the ASCII case is handled before the call).
+ func _parseNonASCII ( ) -> ( isValid : Bool , bitCount : UInt8 )
293
299
// Code units that have been read from the input but not yet consumed.
var buffer : Buffer { get set }
294
300
}
295
301
296
- extension _UTF8Decoder where Buffer == _UIntBuffer < UInt32 , UInt8 > {
302
+ extension _UTF8Decoder where Buffer == _UIntBuffer < UInt32 , UInt8 > {
297
303
public mutating func parseOne< I : IteratorProtocol > (
298
304
_ input: inout I
299
- ) -> Unicode . ParseResult < UInt32 > where I. Element == Unicode . UTF8 . CodeUnit {
305
+ ) -> Unicode . ParseResult < Buffer > where I. Element == Unicode . UTF8 . CodeUnit {
300
306
301
307
// Bufferless ASCII fastpath.
302
308
if _fastPath ( buffer. isEmpty) {
303
309
guard let codeUnit = input. next ( ) else { return . emptyInput }
304
310
// ASCII, return immediately.
305
311
if codeUnit & 0x80 == 0 {
306
- return . valid( UInt32 ( codeUnit ) , length : 1 )
312
+ return . valid( Buffer ( containing : codeUnit ) )
307
313
}
308
314
// Non-ASCII, proceed to buffering mode.
309
315
buffer. append ( codeUnit)
310
316
} else if buffer. _storage & 0x80 == 0 {
311
317
// ASCII in buffer. We don't refill the buffer so we can return
312
318
// to bufferless mode once we've exhausted it.
313
- let codeUnit = buffer. _storage & 0xff
319
+ let codeUnit = UInt8 ( extendingOrTruncating : buffer. _storage)
314
320
buffer. remove ( at: buffer. startIndex)
315
- return . valid( codeUnit , length : 1 )
321
+ return . valid( Buffer ( containing : codeUnit ) )
316
322
}
317
323
// Buffering mode.
318
324
// Fill buffer back to 4 bytes (or as many as are left in the iterator).
@@ -327,24 +333,22 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
327
333
} while buffer. _bitCount < 32
328
334
329
335
// Find one unicode scalar.
330
- // Note our empty bytes are always 0x00, which is required for this call.
331
- let ( valid, length) = _validateBuffer ( )
332
-
336
+ let ( isValid, scalarBitCount) = _parseNonASCII ( )
337
+ _sanityCheck ( scalarBitCount % 8 == 0 && 1 ... 4 ~= scalarBitCount / 8 )
338
+ _sanityCheck ( scalarBitCount <= buffer. _bitCount)
339
+
333
340
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
334
- let bitsConsumed = 8 &* length
335
- _sanityCheck ( 1 ... 4 ~= length && bitsConsumed <= buffer. _bitCount)
336
- let savedBuffer = buffer. _storage
341
+ var encodedScalar = buffer
342
+ encodedScalar. _bitCount = scalarBitCount
337
343
338
344
buffer. _storage = UInt32 (
339
345
// widen to 64 bits so that we can empty the buffer in the 4-byte case
340
- extendingOrTruncating: UInt64 ( buffer. _storage) &>> bitsConsumed )
346
+ extendingOrTruncating: UInt64 ( buffer. _storage) &>> scalarBitCount )
341
347
342
- buffer. _bitCount = buffer. _bitCount &- bitsConsumed
348
+ buffer. _bitCount = buffer. _bitCount &- scalarBitCount
343
349
344
- guard _fastPath ( valid) else {
345
- return . invalid( length: Int ( length) )
346
- }
347
- return . valid( savedBuffer, length: Int ( length) )
350
+ if _fastPath ( isValid) { return . valid( encodedScalar) }
351
+ return . invalid( length: Int ( scalarBitCount &>> 3 ) )
348
352
}
349
353
}
350
354
@@ -364,41 +368,37 @@ extension Unicode.UTF8 : UnicodeEncoding {
364
368
extension UTF8 . ReverseDecoder : _UTF8Decoder {
365
369
public typealias CodeUnit = UInt8
366
370
367
- public static func scalar ( bufferStorage : UInt32 , length : Int ) -> UnicodeScalar {
368
- switch length {
369
- case 1 :
370
- return UnicodeScalar ( _unchecked: bufferStorage & 0xff )
371
- case 2 :
372
- var value = bufferStorage & 0b0______________________11_1111
373
- value |= bufferStorage &>> 2 & 0b0______________0111__1100_0000
371
/// Assembles a `UnicodeScalar` from the UTF-8 code units in `encodedScalar`.
///
/// The layout expected here matches the patterns this decoder's
/// `_parseNonASCII` tests for: the low byte of `_storage` is the last
/// (continuation) code unit and the lead byte is the highest occupied byte.
/// The number of code units is taken from `_bitCount` (8, 16, 24, otherwise 32).
///
/// The result is built with `UnicodeScalar(_unchecked:)`; apart from the
/// `_sanityCheck` on the bit count, no validation is performed here.
///
/// - Parameter encodedScalar: Buffer holding the code units of one scalar.
/// - Returns: The scalar assembled from the payload bits of each code unit.
+ public static func decodeOne ( _ encodedScalar : Buffer ) -> UnicodeScalar {
372
+ let bits = encodedScalar . _storage
373
+ switch encodedScalar . _bitCount {
374
// One code unit: the storage value is used as the scalar value as-is.
// NOTE(review): the removed `scalar(bufferStorage:length:)` masked with 0xff
// here; this version presumably relies on the bits above the low byte being
// zero, as they are for buffers built with `Buffer(containing:)` — confirm.
+ case 8 : return UnicodeScalar ( _unchecked: bits )
375
// Two code units: 6 payload bits from the low byte, then the lead byte's
// 5 payload bits shifted down over the low byte's 2 marker bits.
+ case 16 :
376
+ var value = bits & 0b0______________________11_1111
377
+ value |= bits &>> 2 & 0b0______________0111__1100_0000
374
378
return UnicodeScalar ( _unchecked: value)
375
- case 3 :
376
- var value = bufferStorage & 0b0______________________11_1111
377
- value |= bufferStorage &>> 2 & 0b0______________1111__1100_0000
378
- value |= bufferStorage &>> 4 & 0b0_________1111_0000__0000_0000
379
// Three code units: 6 + 6 payload bits from the two low bytes, then the
// lead byte's 4 payload bits.
+ case 24 :
380
+ var value = bits & 0b0______________________11_1111
381
+ value |= bits &>> 2 & 0b0______________1111__1100_0000
382
+ value |= bits &>> 4 & 0b0_________1111_0000__0000_0000
379
383
return UnicodeScalar ( _unchecked: value)
380
384
default :
381
- _sanityCheck ( length == 4 )
382
- var value = bufferStorage & 0b0______________________11_1111
383
- value |= bufferStorage &>> 2 & 0b0______________1111__1100_0000
384
- value |= bufferStorage &>> 4 & 0b0_____11__1111_0000__0000_0000
385
- value |= bufferStorage &>> 6 & 0b0_1_1100__0000_0000__0000_0000
385
// Four code units: 6 + 6 + 6 payload bits from the three low bytes, then
// the lead byte's 3 payload bits.
+ _sanityCheck ( encodedScalar . _bitCount == 32 )
386
+ var value = bits & 0b0______________________11_1111
387
+ value |= bits &>> 2 & 0b0______________1111__1100_0000
388
+ value |= bits &>> 4 & 0b0_____11__1111_0000__0000_0000
389
+ value |= bits &>> 6 & 0b0_1_1100__0000_0000__0000_0000
386
390
return UnicodeScalar ( _unchecked: value)
387
391
}
388
392
}
389
393
390
394
public // @testable
391
- func _validateBuffer( ) -> ( valid: Bool , length: UInt8 ) {
392
- // FIXME: is this check eliminated when inlined into parseOne?
393
- if buffer. _storage & 0x80 == 0 {
394
- return ( true , 1 )
395
- }
396
-
395
+ func _parseNonASCII( ) -> ( isValid: Bool , bitCount: UInt8 ) {
396
+ _sanityCheck ( buffer. _storage & 0x80 != 0 ) // this case handled elsewhere
397
397
if buffer. _storage & 0b0__1110_0000__1100_0000
398
398
== 0b0__1100_0000__1000_0000 {
399
399
// 2-byte sequence. Top 4 bits of decoded result must be nonzero
400
400
let top4Bits = buffer. _storage & 0b0__0001_1110__0000_0000
401
- if _fastPath ( top4Bits != 0 ) { return ( true , 2 ) }
401
+ if _fastPath ( top4Bits != 0 ) { return ( true , 2 * 8 ) }
402
402
}
403
403
else if buffer. _storage & 0b0__1111_0000__1100_0000__1100_0000
404
404
== 0b0__1110_0000__1000_0000__1000_0000 {
@@ -407,7 +407,7 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
407
407
let top5Bits = buffer. _storage & 0b0__1111__0010_0000__0000_0000
408
408
if _fastPath (
409
409
top5Bits != 0 && top5Bits != 0b0__1101__0010_0000__0000_0000 ) {
410
- return ( true , 3 )
410
+ return ( true , 3 * 8 )
411
411
}
412
412
}
413
413
else if buffer. _storage & 0b0__1111_1000__1100_0000__1100_0000__1100_0000
@@ -418,9 +418,9 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
418
418
if _fastPath (
419
419
top5bits != 0
420
420
&& top5bits <= 0b0__0100__0000_0000__0000_0000__0000_0000
421
- ) { return ( true , 4 ) }
421
+ ) { return ( true , 4 * 8 ) }
422
422
}
423
- return ( false , _invalidLength ( ) )
423
+ return ( false , _invalidLength ( ) &* 8 )
424
424
}
425
425
426
426
/// Returns the length of the invalid sequence that ends with the LSB of
@@ -459,24 +459,22 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
459
459
public typealias CodeUnit = UInt8
460
460
461
461
public // @testable
462
- func _validateBuffer( ) -> ( valid: Bool , length: UInt8 ) {
463
- if buffer. _storage & 0x80 == 0 { // 1-byte sequence (ASCII), buffer: [ ... ... ... CU0 ].
464
- return ( true , 1 )
465
- }
466
-
462
+ func _parseNonASCII( ) -> ( isValid: Bool , bitCount: UInt8 ) {
463
+ _sanityCheck ( buffer. _storage & 0x80 != 0 ) // this case handled elsewhere
464
+
467
465
if buffer. _storage & 0b0__1100_0000__1110_0000
468
466
== 0b0__1000_0000__1100_0000 {
469
467
// 2-byte sequence. At least one of the top 4 bits of the decoded result
470
468
// must be nonzero.
471
- if _fastPath ( buffer. _storage & 0b0_0001_1110 != 0 ) { return ( true , 2 ) }
469
+ if _fastPath ( buffer. _storage & 0b0_0001_1110 != 0 ) { return ( true , 2 * 8 ) }
472
470
}
473
471
else if buffer. _storage & 0b0__1100_0000__1100_0000__1111_0000
474
472
== 0b0__1000_0000__1000_0000__1110_0000 {
475
473
// 3-byte sequence. The top 5 bits of the decoded result must be nonzero
476
474
// and not a surrogate
477
475
let top5Bits = buffer. _storage & 0b0___0010_0000__0000_1111
478
476
if _fastPath ( top5Bits != 0 && top5Bits != 0b0___0010_0000__0000_1101 ) {
479
- return ( true , 3 )
477
+ return ( true , 3 * 8 )
480
478
}
481
479
}
482
480
else if buffer. _storage & 0b0__1100_0000__1100_0000__1100_0000__1111_1000
@@ -487,9 +485,9 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
487
485
if _fastPath (
488
486
top5bits != 0
489
487
&& top5bits. byteSwapped <= 0b0__0000_0100__0000_0000
490
- ) { return ( true , 4 ) }
488
+ ) { return ( true , 4 * 8 ) }
491
489
}
492
- return ( false , _invalidLength ( ) )
490
+ return ( false , _invalidLength ( ) &* 8 )
493
491
}
494
492
495
493
/// Returns the length of the invalid sequence that starts with the LSB of
@@ -517,25 +515,26 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
517
515
return 1
518
516
}
519
517
520
- public static func scalar( bufferStorage: UInt32 , length: Int ) -> UnicodeScalar {
521
- switch length {
522
- case 1 :
523
- return UnicodeScalar ( _unchecked: bufferStorage & 0xff )
524
- case 2 :
525
- var value = ( bufferStorage & 0b0_______________________11_1111__0000_0000 ) &>> 8
526
- value |= ( bufferStorage & 0b0________________________________0001_1111 ) &<< 6
518
/// Assembles a `UnicodeScalar` from the UTF-8 code units in `encodedScalar`.
///
/// The layout expected here matches the patterns this decoder's
/// `_parseNonASCII` tests for: the low byte of `_storage` is the lead code
/// unit and each following continuation byte sits 8 bits higher.
/// The number of code units is taken from `_bitCount` (8, 16, 24, otherwise 32).
///
/// The result is built with `UnicodeScalar(_unchecked:)`; apart from the
/// `_sanityCheck` in the default case, no validation is performed here.
///
/// - Parameter encodedScalar: Buffer holding the code units of one scalar.
/// - Returns: The scalar assembled from the payload bits of each code unit.
+ public static func decodeOne( _ encodedScalar: Buffer ) -> UnicodeScalar {
519
+ let bits = encodedScalar. _storage
520
+ switch encodedScalar. _bitCount {
521
// One code unit: the storage value is used as the scalar value as-is.
// NOTE(review): the removed `scalar(bufferStorage:length:)` masked with 0xff
// here; this version presumably relies on the bits above the low byte being
// zero, as they are for buffers built with `Buffer(containing:)` — confirm.
+ case 8 :
522
+ return UnicodeScalar ( _unchecked: bits)
523
// Two code units: the second byte's 6 payload bits become the low bits of
// the result; the lead byte's 5 payload bits go above them.
+ case 16 :
524
+ var value = ( bits & 0b0_______________________11_1111__0000_0000 ) &>> 8
525
+ value |= ( bits & 0b0________________________________0001_1111 ) &<< 6
527
526
return UnicodeScalar ( _unchecked: value)
528
- case 3 :
529
- var value = ( bufferStorage & 0b0____________11_1111__0000_0000__0000_0000 ) &>> 16
530
- value |= ( bufferStorage & 0b0_______________________11_1111__0000_0000 ) &>> 2
531
- value |= ( bufferStorage & 0b0________________________________0000_1111 ) &<< 12
527
// Three code units: third byte -> result bits 0-5, second byte -> bits 6-11,
// lead byte's 4 payload bits -> bits 12-15.
+ case 24 :
528
+ var value = ( bits & 0b0____________11_1111__0000_0000__0000_0000 ) &>> 16
529
+ value |= ( bits & 0b0_______________________11_1111__0000_0000 ) &>> 2
530
+ value |= ( bits & 0b0________________________________0000_1111 ) &<< 12
532
531
return UnicodeScalar ( _unchecked: value)
533
532
default :
534
- _sanityCheck ( length == 4 )
535
- var value = ( bufferStorage & 0b0_11_1111__0000_0000__0000_0000__0000_0000 ) &>> 24
536
- value |= ( bufferStorage & 0b0____________11_1111__0000_0000__0000_0000 ) &>> 10
537
- value |= ( bufferStorage & 0b0_______________________11_1111__0000_0000 ) &<< 4
538
- value |= ( bufferStorage & 0b0________________________________0000_0111 ) &<< 18
533
// Four code units: fourth byte -> result bits 0-5, third -> bits 6-11,
// second -> bits 12-17, lead byte's 3 payload bits -> bits 18-20.
// NOTE(review): this checks `count == 4` although the switch is on
// `_bitCount`; ReverseDecoder.decodeOne checks `_bitCount == 32` at the same
// point. Presumably equivalent for 8-bit elements — confirm and align.
+ _sanityCheck ( encodedScalar . count == 4 )
534
+ var value = ( bits & 0b0_11_1111__0000_0000__0000_0000__0000_0000 ) &>> 24
535
+ value |= ( bits & 0b0____________11_1111__0000_0000__0000_0000 ) &>> 10
536
+ value |= ( bits & 0b0_______________________11_1111__0000_0000 ) &<< 4
537
+ value |= ( bits & 0b0________________________________0000_0111 ) &<< 18
539
538
return UnicodeScalar ( _unchecked: value)
540
539
}
541
540
}
0 commit comments