17
17
// The BASELINE timings come from the existing standard library Codecs
18
18
19
19
/*
20
- for x in BASELINE FORWARD REVERSE ; do
20
+ for x in BASELINE FORWARD REVERSE SEQUENCE COLLECTION ; do
21
21
echo $x
22
22
swiftc -DBENCHMARK -D$x -O -swift-version 4 UnicodeDecoders.swift -o /tmp/u3-$x
23
23
for i in {1..3}; do
27
27
*/
28
28
29
29
//===----------------------------------------------------------------------===//
30
- // Hack providing an efficient API that is available to the standard library
31
30
extension UnicodeScalar {
  /// Creates a scalar by reinterpreting the bits of `bits` directly, without
  /// performing any validation of the value.
  ///
  /// Hack providing an efficient API that is available to the standard
  /// library.
  @inline(__always)
  @_versioned
  init(_unchecked bits: UInt32) {
    self = unsafeBitCast(bits, to: UnicodeScalar.self)
  }

  /// The scalar whose value is `0xfffd`.
  static var replacementCharacter: UnicodeScalar {
    let value: UInt32 = 0xfffd
    return UnicodeScalar(_unchecked: value)
  }
}
36
40
//===----------------------------------------------------------------------===//
37
41
@_fixed_layout
@@ -44,6 +48,13 @@ public struct _UIntBuffer<
44
48
@_versioned
45
49
var _bitCount : UInt8
46
50
51
/// Creates a buffer directly from its two stored fields, copying them
/// verbatim.
///
/// - Parameters:
///   - _storage: The value assigned to the buffer's `_storage`.
///   - _bitCount: The value assigned to the buffer's `_bitCount`.
@_versioned
@inline(__always)
internal init(_storage: Storage, _bitCount: UInt8) {
  self._bitCount = _bitCount
  self._storage = _storage
}
57
+
47
58
@inline ( __always)
48
59
public init ( containing e: Element ) {
49
60
_storage = Storage ( extendingOrTruncating: e)
@@ -53,7 +64,7 @@ public struct _UIntBuffer<
53
64
54
65
extension _UIntBuffer : Sequence {
55
66
@_fixed_layout
56
- public struct Iterator : IteratorProtocol {
67
+ public struct Iterator : IteratorProtocol , Sequence {
57
68
@inline ( __always)
58
69
public init ( _ x: _UIntBuffer ) { _impl = x }
59
70
@@ -244,9 +255,12 @@ extension Unicode {
244
255
public protocol UnicodeDecoder {
245
256
associatedtype CodeUnit : UnsignedInteger , FixedWidthInteger
246
257
associatedtype Buffer : Collection
247
- where Buffer. Iterator. Element == CodeUnit
258
+
259
+ where Buffer. Iterator. Element == CodeUnit
260
+
248
261
associatedtype EncodedScalar : Collection
249
262
where EncodedScalar. Iterator. Element == CodeUnit
263
+ static var replacement : EncodedScalar { get }
250
264
251
265
init ( )
252
266
@@ -286,6 +300,166 @@ extension UnicodeDecoder {
286
300
}
287
301
}
288
302
303
+
304
extension Unicode {
  /// Pairs a source of code units with the decoder used to parse them.
  ///
  /// The decoder's `CodeUnit` type must be the element type produced by the
  /// code unit iterator.
  struct ParsingIterator<CodeUnits : IteratorProtocol, Decoder : UnicodeDecoder>
  where CodeUnits.Element == Decoder.CodeUnit {
    /// The not-yet-consumed code units.
    var codeUnits: CodeUnits

    /// The decoder that parses `codeUnits`.
    var decoder: Decoder
  }
}
313
extension Unicode.ParsingIterator : IteratorProtocol, Sequence {
  /// Parses one scalar's worth of code units and returns its encoded form.
  ///
  /// - Returns: The parsed content when the decoder reports valid input,
  ///   `Decoder.replacement` when it reports invalid input, and `nil` when
  ///   it reports that the input is empty.
  mutating func next() -> Decoder.EncodedScalar? {
    let outcome = decoder.parseOne(&codeUnits)
    switch outcome {
    case .emptyInput:
      return nil
    case .invalid:
      return Decoder.replacement
    case .valid(let encoded):
      return encoded
    }
  }
}
322
+
323
extension Unicode {
  /// A view of a bidirectional collection of code units in a given
  /// encoding.
  ///
  /// The elements of `CodeUnits` must be the encoding's `CodeUnit` type.
  struct DefaultScalarView<
    CodeUnits : BidirectionalCollection, Encoding : UnicodeEncoding
  > where Encoding.CodeUnit == CodeUnits.Iterator.Element {
    /// The underlying code units.
    var codeUnits: CodeUnits
  }
}
331
+
332
extension Unicode.DefaultScalarView : Sequence {
  /// Iterator over the view; wraps a `ParsingIterator` driven by the
  /// encoding's forward decoder.
  struct Iterator {
    var parsing: Unicode.ParsingIterator<
      CodeUnits.Iterator, Encoding.ForwardDecoder>
  }

  /// Returns an iterator that starts at the beginning of `codeUnits` with a
  /// freshly created forward decoder.
  func makeIterator() -> Iterator {
    let parser = Unicode.ParsingIterator<
      CodeUnits.Iterator, Encoding.ForwardDecoder
    >(
      codeUnits: codeUnits.makeIterator(),
      decoder: Encoding.ForwardDecoder())
    return Iterator(parsing: parser)
  }
}
347
+
348
extension Unicode.DefaultScalarView.Iterator : IteratorProtocol, Sequence {
  /// Returns the next scalar, or `nil` once the underlying parsing iterator
  /// is exhausted.
  ///
  /// Each encoded scalar produced by `parsing` is decoded with the forward
  /// decoder's `decodeOne`.
  mutating func next() -> UnicodeScalar? {
    guard let encoded = parsing.next() else { return nil }
    return Encoding.ForwardDecoder.decodeOne(encoded)
  }
}
355
+
356
extension Unicode {
  /// Direction-specific payload of a scalar view index: a decoder together
  /// with an encoded scalar of that decoder's type.
  enum IndexImpl<E : UnicodeEncoding> {
    /// Payload built with the encoding's forward decoder.
    case forward(E.ForwardDecoder, E.ForwardDecoder.EncodedScalar)

    /// Payload built with the encoding's reverse decoder.
    case reverse(E.ReverseDecoder, E.ReverseDecoder.EncodedScalar)
  }
}
362
extension Unicode.DefaultScalarView {
  /// A position in the scalar view.
  ///
  /// The order of the stored properties determines the argument order of
  /// the memberwise initializer used throughout the view.
  struct Index {
    /// Number of code units recorded for the scalar at this position.
    var parsedLength: UInt8

    /// Decoder and encoded scalar associated with this position.
    var impl: Unicode.IndexImpl<Encoding>

    /// Position in the underlying code units; the only field consulted when
    /// indices are compared.
    var codeUnitIndex: CodeUnits.Index
  }
}
369
+
370
extension Unicode.DefaultScalarView.Index : Comparable {
  /// Two indices are equal when their code unit positions are equal; the
  /// parsed length and decoder payload are ignored.
  static func == (
    a: Unicode.DefaultScalarView<CodeUnits, Encoding>.Index,
    b: Unicode.DefaultScalarView<CodeUnits, Encoding>.Index
  ) -> Bool {
    let sameCodeUnit = a.codeUnitIndex == b.codeUnitIndex
    return sameCodeUnit
  }

  /// Indices are ordered by their code unit positions alone.
  static func < (
    a: Unicode.DefaultScalarView<CodeUnits, Encoding>.Index,
    b: Unicode.DefaultScalarView<CodeUnits, Encoding>.Index
  ) -> Bool {
    let precedes = a.codeUnitIndex < b.codeUnitIndex
    return precedes
  }
}
385
+
386
extension Unicode.DefaultScalarView : Collection {
  /// Returns the index obtained by advancing once from a zero-length forward
  /// seed positioned at code unit `i`.
  ///
  /// The seed carries a fresh forward decoder and the forward replacement as
  /// a placeholder scalar.
  func _forwardIndex(atCodeUnit i: CodeUnits.Index) -> Index {
    let seed = Index(
      parsedLength: 0,
      impl: .forward(
        Encoding.ForwardDecoder(),
        Encoding.ForwardDecoder.replacement),
      codeUnitIndex: i)
    return index(after: seed)
  }

  /// `endIndex` when there are no code units; otherwise the forward index at
  /// the first code unit.
  var startIndex: Index {
    if codeUnits.isEmpty {
      return endIndex
    }
    return _forwardIndex(atCodeUnit: codeUnits.startIndex)
  }

  /// A zero-length index at the end of the code units, carrying a fresh
  /// reverse decoder and the reverse replacement.
  var endIndex: Index {
    let payload: Unicode.IndexImpl<Encoding> = .reverse(
      Encoding.ReverseDecoder(),
      Encoding.ReverseDecoder.replacement)
    return Index(
      parsedLength: 0,
      impl: payload,
      codeUnitIndex: codeUnits.endIndex)
  }

  /// Decodes the encoded scalar stored in `i`, using the decoder type that
  /// matches the index's direction.
  subscript(i: Index) -> UnicodeScalar {
    switch i.impl {
    case .reverse(_, let encoded):
      return Encoding.ReverseDecoder.decodeOne(encoded)
    case .forward(_, let encoded):
      return Encoding.ForwardDecoder.decodeOne(encoded)
    }
  }

  /// Returns the index following `i`.
  ///
  /// Only forward indices are supported; a reverse index traps.
  /// `endIndex` is returned when the decoder reports empty input.
  func index(after i: Index) -> Index {
    switch i.impl {
    case .reverse(_, _):
      fatalError(" implement me ")
      // The following has the right semantics but kills inlining.  Needs a
      // refactor to be right.
      //
      // return index(after: _forwardIndex(atCodeUnit: i.codeUnitIndex))

    case .forward(var decoder, _):
      // Code units belonging to the scalar at `i`.
      let consumed: CodeUnits.IndexDistance = numericCast(i.parsedLength)
      // Code units currently held in the decoder's buffer.
      let buffered = CodeUnits.IndexDistance(decoder.buffer.count)

      // Position of the code unit after the last one we've processed.
      let resumePoint = codeUnits.index(
        i.codeUnitIndex, offsetBy: buffered + consumed)
      var remaining =
        codeUnits[resumePoint..<codeUnits.endIndex].makeIterator()

      let length: UInt8
      let content: Encoding.ForwardDecoder.EncodedScalar
      switch decoder.parseOne(&remaining) {
      case .emptyInput:
        return endIndex
      case .valid(let s):
        length = UInt8(extendingOrTruncating: s.count)
        content = s
      case .invalid(let l):
        // Invalid input is represented by the replacement scalar.
        length = UInt8(extendingOrTruncating: l)
        content = Encoding.ForwardDecoder.replacement
      }

      // The new index sits just past the code units of the scalar at `i`.
      return Index(
        parsedLength: length,
        impl: .forward(decoder, content),
        codeUnitIndex: codeUnits.index(i.codeUnitIndex, offsetBy: consumed))
    }
  }
}
462
+
289
463
public protocol UnicodeEncoding {
290
464
associatedtype CodeUnit
291
465
@@ -359,22 +533,25 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
359
533
extension Unicode.UTF8 : UnicodeEncoding {
  /// Decoder state for parsing UTF-8 front to back.
  public struct ForwardDecoder {
    public typealias Buffer = _UIntBuffer<UInt32, UInt8>

    /// Code units held by the decoder.
    public var buffer: Buffer

    /// Creates a decoder with a default-constructed buffer.
    public init() {
      self.buffer = Buffer()
    }
  }

  /// Decoder state for parsing UTF-8 back to front.
  public struct ReverseDecoder {
    public typealias Buffer = _UIntBuffer<UInt32, UInt8>

    /// Code units held by the decoder.
    public var buffer: Buffer

    /// Creates a decoder with a default-constructed buffer.
    public init() {
      self.buffer = Buffer()
    }
  }
}
374
546
375
547
extension UTF8 . ReverseDecoder : _UTF8Decoder {
376
548
public typealias CodeUnit = UInt8
549
+ public typealias EncodedScalar = Buffer
377
550
551
/// The encoded scalar used in place of invalid input: 24 bits of storage
/// holding `0xefbfbd`.
// NOTE(review): presumably the bytes of U+FFFD's UTF-8 form laid out in the
// order this decoder consumes them; confirm against `_UIntBuffer`.
public static var replacement: EncodedScalar {
  let encoded = EncodedScalar(_storage: 0xefbfbd, _bitCount: 24)
  return encoded
}
554
+
378
555
public static func decodeOne( _ source: EncodedScalar ) -> UnicodeScalar {
379
556
let bits = source. _storage
380
557
switch source. _bitCount {
@@ -464,6 +641,11 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
464
641
465
642
extension Unicode . UTF8 . ForwardDecoder : _UTF8Decoder {
466
643
public typealias CodeUnit = UInt8
644
+ public typealias EncodedScalar = Buffer
645
+
646
/// The encoded scalar used in place of invalid input: 24 bits of storage
/// holding `0xbdbfef`.
// NOTE(review): presumably the bytes of U+FFFD's UTF-8 form laid out in the
// order this decoder consumes them; confirm against `_UIntBuffer`.
public static var replacement: EncodedScalar {
  let encoded = EncodedScalar(_storage: 0xbdbfef, _bitCount: 24)
  return encoded
}
467
649
468
650
public // @testable
469
651
func _parseNonASCII( ) -> ( isValid: Bool , bitCount: UInt8 ) {
@@ -642,6 +824,26 @@ func checkDecodeUTF8(
642
824
_ expectedHead: [ UInt32 ] ,
643
825
_ expectedRepairedTail: [ UInt32 ] , _ utf8Str: [ UInt8 ]
644
826
) -> AssertionResult {
827
+ var expected : [ UnicodeScalar ] = [ ]
828
+ do {
829
+ var i = utf8Str. makeIterator ( )
830
+ UTF8 . ForwardDecoder. decode ( & i, repairingIllFormedSequences: true ) {
831
+ expected. append ( $0)
832
+ }
833
+ }
834
+
835
+ let scalars = Unicode . DefaultScalarView < [ UInt8 ] , UTF8 > ( codeUnits: utf8Str)
836
+ expectEqualSequence ( expected, scalars)
837
+
838
+ do {
839
+ var x = scalars. makeIterator ( )
840
+ var j = scalars. startIndex
841
+ while ( j != scalars. endIndex) {
842
+ expectEqual ( x. next ( ) !, scalars [ j] )
843
+ j = scalars. index ( after: j)
844
+ }
845
+ expectNil ( x. next ( ) )
846
+ }
645
847
return checkDecodeUTF ( UTF8 . self, expectedHead, expectedRepairedTail, utf8Str)
646
848
}
647
849
@@ -2266,13 +2468,25 @@ public func run_UTF8Decode(_ N: Int) {
2266
2468
#if FORWARD
2267
2469
var it = string. makeIterator ( )
2268
2470
typealias D = UTF8 . ForwardDecoder
2471
+ D . decode ( & it, repairingIllFormedSequences: true ) { total = total &+ $0. value }
2269
2472
#elseif REVERSE
2270
2473
var it = string. reversed ( ) . makeIterator ( )
2271
2474
typealias D = UTF8 . ReverseDecoder
2475
+ D . decode ( & it, repairingIllFormedSequences: true ) { total = total &+ $0. value }
2476
+ #elseif SEQUENCE
2477
+ for s in Unicode . DefaultScalarView < [ UInt8 ] , UTF8 > ( codeUnits: string) {
2478
+ total = total &+ s. value
2479
+ }
2480
+ #elseif COLLECTION
2481
+ let scalars = Unicode . DefaultScalarView < [ UInt8 ] , UTF8 > ( codeUnits: string)
2482
+ var i = scalars. startIndex
2483
+ while i != scalars. endIndex {
2484
+ total = total &+ scalars [ i] . value
2485
+ i = scalars. index ( after: i)
2486
+ }
2272
2487
#else
2273
2488
Error_Unknown_Benchmark ( )
2274
2489
#endif
2275
- D . decode ( & it, repairingIllFormedSequences: true ) { total = total &+ $0. value }
2276
2490
#endif
2277
2491
}
2278
2492
}
0 commit comments