Skip to content

Commit 662ea1f

Browse files
author
Dave Abrahams
committed
[stdlib] UnicodeDecoders: basic view proof-of-concept
Implements Sequence and Collection views over arbitrary CodeUnits, demonstrating that indexing is possible. However, collection-style decoding is approximately 3x slower than sequence-style, because Decoders have a fundamentally Sequence-oriented interface. I think I know what needs to be done to get parity.
1 parent 63a2033 commit 662ea1f

File tree

1 file changed

+221
-7
lines changed

1 file changed

+221
-7
lines changed

test/Prototypes/UnicodeDecoders.swift

Lines changed: 221 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
// The BASELINE timings come from the existing standard library Codecs
1818

1919
/*
20-
for x in BASELINE FORWARD REVERSE ; do
20+
for x in BASELINE FORWARD REVERSE SEQUENCE COLLECTION ; do
2121
echo $x
2222
swiftc -DBENCHMARK -D$x -O -swift-version 4 UnicodeDecoders.swift -o /tmp/u3-$x
2323
for i in {1..3}; do
@@ -27,11 +27,15 @@
2727
*/
2828

2929
//===----------------------------------------------------------------------===//
30-
// Hack providing an efficient API that is available to the standard library
3130
// Prototype-only additions to UnicodeScalar used by the decoders in this file.
extension UnicodeScalar {
31+
// Hack providing an efficient API that is available to the standard library
3232
@_versioned
3333
@inline(__always)
3434
// Reinterprets the bits of `x` as a UnicodeScalar via `unsafeBitCast`; no
// validation of `x` is performed here.
init(_unchecked x: UInt32) { self = unsafeBitCast(x, to: UnicodeScalar.self) }
35+
36+
// The scalar with value 0xFFFD, constructed through the unchecked
// initializer above.
static var replacementCharacter: UnicodeScalar {
37+
return UnicodeScalar(_unchecked: 0xfffd)
38+
}
3539
}
3640
//===----------------------------------------------------------------------===//
3741
@_fixed_layout
@@ -44,6 +48,13 @@ public struct _UIntBuffer<
4448
@_versioned
4549
var _bitCount: UInt8
4650

51+
// Memberwise initializer: stores the supplied raw storage word and bit count
// directly, without any adjustment.
@inline(__always)
52+
@_versioned
53+
internal init(_storage: Storage, _bitCount: UInt8) {
54+
self._storage = _storage
55+
self._bitCount = _bitCount
56+
}
57+
4758
@inline(__always)
4859
public init(containing e: Element) {
4960
_storage = Storage(extendingOrTruncating: e)
@@ -53,7 +64,7 @@ public struct _UIntBuffer<
5364

5465
extension _UIntBuffer : Sequence {
5566
@_fixed_layout
56-
public struct Iterator : IteratorProtocol {
67+
public struct Iterator : IteratorProtocol, Sequence {
5768
@inline(__always)
5869
public init(_ x: _UIntBuffer) { _impl = x }
5970

@@ -244,9 +255,12 @@ extension Unicode {
244255
public protocol UnicodeDecoder {
245256
associatedtype CodeUnit : UnsignedInteger, FixedWidthInteger
246257
associatedtype Buffer : Collection
247-
where Buffer.Iterator.Element == CodeUnit
258+
259+
where Buffer.Iterator.Element == CodeUnit
260+
248261
associatedtype EncodedScalar : Collection
249262
where EncodedScalar.Iterator.Element == CodeUnit
263+
static var replacement: EncodedScalar { get }
250264

251265
init()
252266

@@ -286,6 +300,166 @@ extension UnicodeDecoder {
286300
}
287301
}
288302

303+
304+
// Pairs an iterator of code units with a decoder whose CodeUnit type matches
// the iterator's element type. The iteration logic lives in the conformance
// extension on Unicode.ParsingIterator.
extension Unicode {
305+
struct ParsingIterator<
306+
CodeUnits : IteratorProtocol,
307+
Decoder: UnicodeDecoder
308+
> where Decoder.CodeUnit == CodeUnits.Element {
309+
// Source of code units; passed `inout` to the decoder on each step.
var codeUnits: CodeUnits
310+
// Decoder state carried across calls to `next()`.
var decoder: Decoder
311+
}
312+
}
313+
// Makes ParsingIterator an iterator (and a sequence) of encoded scalars.
extension Unicode.ParsingIterator : IteratorProtocol, Sequence {
314+
// Asks the decoder to parse one scalar from `codeUnits` and maps the result:
//   .valid      -> the parsed encoded scalar
//   .invalid    -> `Decoder.replacement` (the associated value is ignored)
//   .emptyInput -> nil, ending iteration
mutating func next() -> Decoder.EncodedScalar? {
315+
switch decoder.parseOne(&codeUnits) {
316+
case let .valid(scalarContent): return scalarContent
317+
case .invalid: return Decoder.replacement
318+
case .emptyInput: return nil
319+
}
320+
}
321+
}
322+
323+
// A view over a bidirectional collection of code units whose element type is
// the CodeUnit of `Encoding`. Sequence and Collection conformances are added
// in separate extensions.
extension Unicode {
324+
struct DefaultScalarView<
325+
CodeUnits: BidirectionalCollection,
326+
Encoding: UnicodeEncoding
327+
> where CodeUnits.Iterator.Element == Encoding.CodeUnit {
328+
// The underlying code unit storage being viewed.
var codeUnits: CodeUnits
329+
}
330+
}
331+
332+
// Sequence conformance: iteration is driven by a ParsingIterator using the
// encoding's ForwardDecoder.
extension Unicode.DefaultScalarView : Sequence {
333+
// Wraps a ParsingIterator over the code units; `next()` is supplied by the
// extension on Unicode.DefaultScalarView.Iterator.
struct Iterator {
334+
var parsing: Unicode.ParsingIterator<
335+
CodeUnits.Iterator, Encoding.ForwardDecoder
336+
>
337+
}
338+
339+
// Builds an iterator from a fresh code unit iterator and a freshly
// initialized ForwardDecoder.
func makeIterator() -> Iterator {
340+
return Iterator(
341+
parsing: Unicode.ParsingIterator(
342+
codeUnits: codeUnits.makeIterator(),
343+
decoder: Encoding.ForwardDecoder()
344+
))
345+
}
346+
}
347+
348+
// Makes the view's Iterator an iterator (and a sequence) of UnicodeScalar.
extension Unicode.DefaultScalarView.Iterator : IteratorProtocol, Sequence {
349+
// Pulls the next encoded scalar from `parsing` and decodes it with
// `ForwardDecoder.decodeOne`; returns nil when `parsing` is exhausted.
mutating func next() -> UnicodeScalar? {
350+
return parsing.next().map {
351+
Encoding.ForwardDecoder.decodeOne($0)
352+
}
353+
}
354+
}
355+
356+
// Direction-specific payload stored in a DefaultScalarView.Index: a decoder
// together with an encoded scalar of that decoder's EncodedScalar type.
extension Unicode {
357+
enum IndexImpl<E: UnicodeEncoding> {
358+
// Forward decoder state plus the encoded scalar associated with the index.
case forward(E.ForwardDecoder, E.ForwardDecoder.EncodedScalar)
359+
// Reverse decoder state plus an encoded scalar; in this file it is only
// constructed by `endIndex`.
case reverse(E.ReverseDecoder, E.ReverseDecoder.EncodedScalar)
360+
}
361+
}
362+
// Index type for DefaultScalarView's Collection conformance.
extension Unicode.DefaultScalarView {
363+
struct Index {
364+
// Set from the parse result in `index(after:)` (the encoded scalar's count,
// or the length reported for an invalid sequence) and used there as the
// offset from `codeUnitIndex` to the next position. Zero for `endIndex`.
var parsedLength: UInt8
365+
// Decoder state and encoded scalar for this position.
var impl: Unicode.IndexImpl<Encoding>
366+
// Position in the underlying code units; the only field that takes part
// in index comparison.
var codeUnitIndex: CodeUnits.Index
367+
}
368+
}
369+
370+
// Ordering and equality of indices depend solely on `codeUnitIndex`;
// `parsedLength` and `impl` are ignored.
extension Unicode.DefaultScalarView.Index : Comparable {
371+
// True when `lhs` refers to an earlier code unit position than `rhs`.
static func < (
372+
lhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index,
373+
rhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index
374+
) -> Bool {
375+
return lhs.codeUnitIndex < rhs.codeUnitIndex
376+
}
377+
378+
// True when both indices refer to the same code unit position, regardless
// of whether they carry forward or reverse decoder state.
static func == (
379+
lhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index,
380+
rhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index
381+
) -> Bool {
382+
return lhs.codeUnitIndex == rhs.codeUnitIndex
383+
}
384+
}
385+
386+
// Collection conformance. Forward traversal is implemented; advancing an
// index that carries reverse decoder state is not (it traps).
extension Unicode.DefaultScalarView : Collection {
387+
// Returns the index of the scalar starting at code unit position `i`.
// It builds a placeholder index with zero parsed length and a fresh forward
// decoder at `i`, then advances it once so that the result carries an
// actually-parsed scalar.
func _forwardIndex(atCodeUnit i: CodeUnits.Index) -> Index {
388+
return index(
389+
after: Index(
390+
parsedLength: 0,
391+
impl: .forward(
392+
Encoding.ForwardDecoder(),
393+
Encoding.ForwardDecoder.replacement),
394+
codeUnitIndex: i
395+
))
396+
}
397+
398+
// `endIndex` when there are no code units; otherwise the forward index
// parsed at the first code unit.
var startIndex: Index {
399+
return codeUnits.isEmpty ? endIndex
400+
: _forwardIndex(atCodeUnit: codeUnits.startIndex)
401+
}
402+
403+
// An index positioned at `codeUnits.endIndex`, carrying a fresh reverse
// decoder and the reverse decoder's replacement as a placeholder scalar.
var endIndex: Index {
404+
return Index(
405+
parsedLength: 0,
406+
impl: .reverse(
407+
Encoding.ReverseDecoder(),
408+
Encoding.ReverseDecoder.replacement),
409+
codeUnitIndex: codeUnits.endIndex
410+
)
411+
}
412+
413+
// Decodes the encoded scalar stored in the index, using the decoder type
// that matches the index's direction. The code units are not re-read.
subscript(i: Index) -> UnicodeScalar {
414+
switch i.impl {
415+
case .forward(_, let s):
416+
return Encoding.ForwardDecoder.decodeOne(s)
417+
case .reverse(_, let s):
418+
return Encoding.ReverseDecoder.decodeOne(s)
419+
}
420+
}
421+
422+
// Returns the index following `i`. Only indices carrying forward decoder
// state are supported; the reverse case hits `fatalError`.
func index(after i: Index) -> Index {
423+
switch i.impl {
424+
case .forward(var d, _):
425+
// Number of code units occupied by the scalar at `i`.
let stride = i.parsedLength
426+
427+
// position of the code unit after the last one we've processed
// (the count of the decoder's buffer is added on top of the stride, so
// parsing resumes past code units the decoder already holds)
428+
let i0 = codeUnits.index(
429+
i.codeUnitIndex,
430+
offsetBy: CodeUnits.IndexDistance(d.buffer.count) + numericCast(stride))
431+
432+
var tail = codeUnits[i0..<codeUnits.endIndex].makeIterator()
433+
switch d.parseOne(&tail) {
434+
435+
// In both the valid and invalid cases the new index sits `stride` code
// units past `i` and records the updated decoder state.
case .valid(let s):
436+
return Index(
437+
parsedLength: UInt8(extendingOrTruncating: s.count),
438+
impl: .forward(d, s),
439+
codeUnitIndex:
440+
codeUnits.index(i.codeUnitIndex, offsetBy: numericCast(stride)))
441+
442+
// Invalid input: the replacement is stored as the scalar, and the length
// reported by the decoder becomes the parsed length.
case .invalid(let l):
443+
return Index(
444+
parsedLength: UInt8(extendingOrTruncating: l),
445+
impl: .forward(d, Encoding.ForwardDecoder.replacement),
446+
codeUnitIndex:
447+
codeUnits.index(i.codeUnitIndex, offsetBy: numericCast(stride)))
448+
449+
case .emptyInput:
450+
return endIndex
451+
}
452+
453+
case .reverse(_,_):
454+
fatalError("implement me")
455+
// The following has the right semantics but kills inlining. Needs a
456+
// refactor to be right.
457+
//
458+
// return index(after: _forwardIndex(atCodeUnit: i.codeUnitIndex))
459+
}
460+
}
461+
}
462+
289463
public protocol UnicodeEncoding {
290464
associatedtype CodeUnit
291465

@@ -359,22 +533,25 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
359533
extension Unicode.UTF8 : UnicodeEncoding {
360534
public struct ForwardDecoder {
361535
public typealias Buffer = _UIntBuffer<UInt32, UInt8>
362-
public typealias EncodedScalar = Buffer
363536
public init() { buffer = Buffer() }
364537
public var buffer: Buffer
365538
}
366539

367540
public struct ReverseDecoder {
368541
public typealias Buffer = _UIntBuffer<UInt32, UInt8>
369-
public typealias EncodedScalar = Buffer
370542
public init() { buffer = Buffer() }
371543
public var buffer: Buffer
372544
}
373545
}
374546

375547
extension UTF8.ReverseDecoder : _UTF8Decoder {
376548
public typealias CodeUnit = UInt8
549+
public typealias EncodedScalar = Buffer
377550

551+
// Encoded form of U+FFFD (UTF-8 bytes EF BF BD) as a 24-bit buffer, packed
// with 0xBD in the least-significant byte, i.e. the opposite byte order from
// ForwardDecoder.replacement.
public static var replacement : EncodedScalar {
552+
return EncodedScalar(_storage: 0xefbfbd, _bitCount: 24)
553+
}
554+
378555
public static func decodeOne(_ source: EncodedScalar) -> UnicodeScalar {
379556
let bits = source._storage
380557
switch source._bitCount {
@@ -464,6 +641,11 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
464641

465642
extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
466643
public typealias CodeUnit = UInt8
644+
public typealias EncodedScalar = Buffer
645+
646+
// Encoded form of U+FFFD (UTF-8 bytes EF BF BD) as a 24-bit buffer, packed
// with 0xEF in the least-significant byte, i.e. the opposite byte order from
// ReverseDecoder.replacement.
public static var replacement : EncodedScalar {
647+
return EncodedScalar(_storage: 0xbdbfef, _bitCount: 24)
648+
}
467649

468650
public // @testable
469651
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
@@ -642,6 +824,26 @@ func checkDecodeUTF8(
642824
_ expectedHead: [UInt32],
643825
_ expectedRepairedTail: [UInt32], _ utf8Str: [UInt8]
644826
) -> AssertionResult {
827+
var expected: [UnicodeScalar] = []
828+
do {
829+
var i = utf8Str.makeIterator()
830+
UTF8.ForwardDecoder.decode(&i, repairingIllFormedSequences: true) {
831+
expected.append($0)
832+
}
833+
}
834+
835+
let scalars = Unicode.DefaultScalarView<[UInt8], UTF8>(codeUnits: utf8Str)
836+
expectEqualSequence(expected, scalars)
837+
838+
do {
839+
var x = scalars.makeIterator()
840+
var j = scalars.startIndex
841+
while (j != scalars.endIndex) {
842+
expectEqual(x.next()!, scalars[j])
843+
j = scalars.index(after: j)
844+
}
845+
expectNil(x.next())
846+
}
645847
return checkDecodeUTF(UTF8.self, expectedHead, expectedRepairedTail, utf8Str)
646848
}
647849

@@ -2266,13 +2468,25 @@ public func run_UTF8Decode(_ N: Int) {
22662468
#if FORWARD
22672469
var it = string.makeIterator()
22682470
typealias D = UTF8.ForwardDecoder
2471+
D.decode(&it, repairingIllFormedSequences: true) { total = total &+ $0.value }
22692472
#elseif REVERSE
22702473
var it = string.reversed().makeIterator()
22712474
typealias D = UTF8.ReverseDecoder
2475+
D.decode(&it, repairingIllFormedSequences: true) { total = total &+ $0.value }
2476+
#elseif SEQUENCE
2477+
for s in Unicode.DefaultScalarView<[UInt8], UTF8>(codeUnits: string) {
2478+
total = total &+ s.value
2479+
}
2480+
#elseif COLLECTION
2481+
let scalars = Unicode.DefaultScalarView<[UInt8], UTF8>(codeUnits: string)
2482+
var i = scalars.startIndex
2483+
while i != scalars.endIndex {
2484+
total = total &+ scalars[i].value
2485+
i = scalars.index(after: i)
2486+
}
22722487
#else
22732488
Error_Unknown_Benchmark()
22742489
#endif
2275-
D.decode(&it, repairingIllFormedSequences: true) { total = total &+ $0.value }
22762490
#endif
22772491
}
22782492
}

0 commit comments

Comments
 (0)