Skip to content

Commit 1753e66

Browse files
author
Dave Abrahams
committed
[stdlib] UnicodeDecoders: further 10% speedup
Working in bit counts throughout, rather than shuttling back and forth between bit counts and code unit counts, saves a lot.
1 parent 65daf5d commit 1753e66

File tree

1 file changed

+18
-19
lines changed

1 file changed

+18
-19
lines changed

test/Prototypes/UnicodeDecoders.swift

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ public protocol UnicodeEncoding {
295295

296296

297297
public protocol _UTF8Decoder : UnicodeDecoder {
298-
func _parseNonASCII() -> (isValid: Bool, length: UInt8)
298+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8)
299299
var buffer: Buffer { get set }
300300
}
301301

@@ -333,23 +333,22 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
333333
} while buffer._bitCount < 32
334334

335335
// Find one unicode scalar.
336-
let (isValid, length) = _parseNonASCII()
337-
_sanityCheck(1...4 ~= length)
338-
_sanityCheck(length <= buffer.count)
336+
let (isValid, scalarBitCount) = _parseNonASCII()
337+
_sanityCheck(scalarBitCount % 8 == 0 && 1...4 ~= scalarBitCount / 8)
338+
_sanityCheck(scalarBitCount <= buffer._bitCount)
339339

340340
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
341-
let bitsConsumed = length << 3
342341
var encodedScalar = buffer
343-
encodedScalar._bitCount = bitsConsumed
342+
encodedScalar._bitCount = scalarBitCount
344343

345344
buffer._storage = UInt32(
346345
// widen to 64 bits so that we can empty the buffer in the 4-byte case
347-
extendingOrTruncating: UInt64(buffer._storage) &>> bitsConsumed)
346+
extendingOrTruncating: UInt64(buffer._storage) &>> scalarBitCount)
348347

349-
buffer._bitCount = buffer._bitCount &- bitsConsumed
348+
buffer._bitCount = buffer._bitCount &- scalarBitCount
350349

351350
if _fastPath(isValid) { return .valid(encodedScalar) }
352-
return .invalid(length: Int(length))
351+
return .invalid(length: Int(scalarBitCount &>> 3))
353352
}
354353
}
355354

@@ -393,13 +392,13 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
393392
}
394393

395394
public // @testable
396-
func _parseNonASCII() -> (isValid: Bool, length: UInt8) {
395+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
397396
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
398397
if buffer._storage & 0b0__1110_0000__1100_0000
399398
== 0b0__1100_0000__1000_0000 {
400399
// 2-byte sequence. Top 4 bits of decoded result must be nonzero
401400
let top4Bits = buffer._storage & 0b0__0001_1110__0000_0000
402-
if _fastPath(top4Bits != 0) { return (true, 2) }
401+
if _fastPath(top4Bits != 0) { return (true, 2*8) }
403402
}
404403
else if buffer._storage & 0b0__1111_0000__1100_0000__1100_0000
405404
== 0b0__1110_0000__1000_0000__1000_0000 {
@@ -408,7 +407,7 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
408407
let top5Bits = buffer._storage & 0b0__1111__0010_0000__0000_0000
409408
if _fastPath(
410409
top5Bits != 0 && top5Bits != 0b0__1101__0010_0000__0000_0000) {
411-
return (true, 3)
410+
return (true, 3*8)
412411
}
413412
}
414413
else if buffer._storage & 0b0__1111_1000__1100_0000__1100_0000__1100_0000
@@ -419,9 +418,9 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
419418
if _fastPath(
420419
top5bits != 0
421420
&& top5bits <= 0b0__0100__0000_0000__0000_0000__0000_0000
422-
) { return (true, 4) }
421+
) { return (true, 4*8) }
423422
}
424-
return (false, _invalidLength())
423+
return (false, _invalidLength() &* 8)
425424
}
426425

427426
/// Returns the length of the invalid sequence that ends with the LSB of
@@ -460,22 +459,22 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
460459
public typealias CodeUnit = UInt8
461460

462461
public // @testable
463-
func _parseNonASCII() -> (isValid: Bool, length: UInt8) {
462+
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
464463
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
465464

466465
if buffer._storage & 0b0__1100_0000__1110_0000
467466
== 0b0__1000_0000__1100_0000 {
468467
// 2-byte sequence. At least one of the top 4 bits of the decoded result
469468
// must be nonzero.
470-
if _fastPath(buffer._storage & 0b0_0001_1110 != 0) { return (true, 2) }
469+
if _fastPath(buffer._storage & 0b0_0001_1110 != 0) { return (true, 2*8) }
471470
}
472471
else if buffer._storage & 0b0__1100_0000__1100_0000__1111_0000
473472
== 0b0__1000_0000__1000_0000__1110_0000 {
474473
// 3-byte sequence. The top 5 bits of the decoded result must be nonzero
475474
// and not a surrogate
476475
let top5Bits = buffer._storage & 0b0___0010_0000__0000_1111
477476
if _fastPath(top5Bits != 0 && top5Bits != 0b0___0010_0000__0000_1101) {
478-
return (true, 3)
477+
return (true, 3*8)
479478
}
480479
}
481480
else if buffer._storage & 0b0__1100_0000__1100_0000__1100_0000__1111_1000
@@ -486,9 +485,9 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
486485
if _fastPath(
487486
top5bits != 0
488487
&& top5bits.byteSwapped <= 0b0__0000_0100__0000_0000
489-
) { return (true, 4) }
488+
) { return (true, 4*8) }
490489
}
491-
return (false, _invalidLength())
490+
return (false, _invalidLength() &* 8)
492491
}
493492

494493
/// Returns the length of the invalid sequence that starts with the LSB of

0 commit comments

Comments
 (0)