Skip to content

Commit cbf157f

Browse files
author
Lance Parker
authored
[stdlib] Unify String hashing implementation (swiftlang#14921)
* Add partial range subscripts to _UnmanagedOpaqueString * Use SipHash13+_NormalizedCodeUnitIterator for String hashes on all platforms * Remove unnecessary collation algorithm shims * Pass the buffer to the SipHasher for ASCII * Hash the ASCII parts of UTF16 strings the same way we hash pure ASCII strings * De-dupe some code that can be shared between _UnmanagedOpaqueString and _UnmanagedString<UInt16> * ASCII strings now hash consistently in hashASCII() and hashUTF16() * Fix Zalgo comparison regression * Use hasher * Fix crash when appending to an empty _FixedArray * Compact ASCII characters into a single UInt64 for hashing * String: Switch to _hash(into:)-based hashing. This should speed up String hashing quite a bit, as doing it through hashValue involves two rounds of SipHash nested in each other. * Remove obsolete workaround for ARC traffic * Ditch _FixedArray<UInt8> in favor of _UIntBuffer<UInt64, UInt8> * Bad rebase remnants * Fix failing benchmarks * Michael's feedback * Clarify the comment about nul-terminated string hashes
1 parent f5d43e2 commit cbf157f

File tree

9 files changed

+215
-388
lines changed

9 files changed

+215
-388
lines changed

stdlib/public/SwiftShims/UnicodeShims.h

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,6 @@ SWIFT_RUNTIME_STDLIB_INTERFACE
6262
const __swift_uint16_t *
6363
_swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrix;
6464

65-
SWIFT_RUNTIME_STDLIB_INTERFACE
66-
void *_swift_stdlib_unicodeCollationIterator_create(
67-
const __swift_uint16_t *Str,
68-
__swift_uint32_t Length);
69-
70-
SWIFT_RUNTIME_STDLIB_INTERFACE
71-
__swift_int32_t _swift_stdlib_unicodeCollationIterator_next(
72-
void *CollationIterator, __swift_bool *HitEnd);
73-
74-
SWIFT_RUNTIME_STDLIB_INTERFACE
75-
void _swift_stdlib_unicodeCollationIterator_delete(
76-
void *CollationIterator);
77-
78-
SWIFT_RUNTIME_STDLIB_INTERFACE
79-
const __swift_int32_t *_swift_stdlib_unicode_getASCIICollationTable();
80-
8165
SWIFT_RUNTIME_STDLIB_INTERFACE
8266
__swift_int32_t _swift_stdlib_unicode_strToUpper(
8367
__swift_uint16_t *Destination, __swift_int32_t DestinationCapacity,

stdlib/public/core/FixedArray.swift.gyb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ extension _FixedArray${N} {
127127
@_versioned
128128
internal mutating func append(_ newElement: T) {
129129
_sanityCheck(count < capacity)
130-
self[count] = newElement
131130
_count += 1
131+
self[count-1] = newElement
132132
}
133133
}
134134

stdlib/public/core/NormalizedCodeUnitIterator.swift

Lines changed: 21 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@ struct _NormalizedCodeUnitIterator: IteratorProtocol {
2323

2424
typealias CodeUnit = UInt16
2525

26-
init(_ opaqueString: _UnmanagedOpaqueString, startIndex: Int = 0) {
27-
source = _UnmanagedOpaqueStringSource(opaqueString, start: startIndex)
28-
}
29-
30-
init(_ unmanagedString: _UnmanagedString<UInt16>, startIndex: Int = 0) {
31-
source = _UnmanagedStringSource(unmanagedString, start: startIndex)
26+
init<Source: BidirectionalCollection>
27+
(_ collection: Source)
28+
where Source.Element == UInt16, Source.SubSequence == Source
29+
{
30+
source = _CollectionSource(collection)
3231
}
3332

3433
init(_ guts: _StringGuts, _ range: Range<Int>, startIndex: Int = 0) {
@@ -60,23 +59,27 @@ struct _NormalizedCodeUnitIterator: IteratorProtocol {
6059
}
6160
}
6261

63-
struct _UnmanagedOpaqueStringSource: _SegmentSource {
62+
struct _CollectionSource<Source: BidirectionalCollection>: _SegmentSource
63+
where Source.Element == UInt16, Source.SubSequence == Source
64+
{
6465
var remaining: Int {
65-
return opaqueString.count - index
66+
return collection.distance(from: index, to: collection.endIndex)
6667
}
67-
var opaqueString: _UnmanagedOpaqueString
68-
var index: Int
68+
var collection: Source
69+
var index: Source.Index
6970

70-
init(_ opaqueString: _UnmanagedOpaqueString, start: Int = 0) {
71-
self.opaqueString = opaqueString
72-
index = start
71+
init(_ collection: Source) {
72+
self.collection = collection
73+
index = collection.startIndex
7374
}
7475

76+
@_specialize(where Source == _UnmanagedString<UInt16>)
77+
@_specialize(where Source == _UnmanagedOpaqueString)
7578
mutating func tryFill(buffer: UnsafeMutableBufferPointer<UInt16>) -> Int? {
7679
var bufferIndex = 0
7780
let originalIndex = index
7881
repeat {
79-
guard index < opaqueString.count else {
82+
guard index != collection.endIndex else {
8083
break
8184
}
8285

@@ -86,49 +89,11 @@ struct _NormalizedCodeUnitIterator: IteratorProtocol {
8689
return nil
8790
}
8891

89-
let cu = opaqueString[index]
92+
let cu = collection[index]
9093
buffer[bufferIndex] = cu
91-
index += 1
94+
index = collection.index(after: index)
9295
bufferIndex += 1
93-
} while !opaqueString.hasNormalizationBoundary(after: index - 1)
94-
95-
return bufferIndex
96-
}
97-
}
98-
99-
struct _UnmanagedStringSource: _SegmentSource {
100-
var remaining: Int {
101-
return unmanagedString.count - index
102-
}
103-
104-
var unmanagedString: _UnmanagedString<UInt16>
105-
var index: Int
106-
107-
init(_ unmanagedString: _UnmanagedString<UInt16>, start: Int = 0) {
108-
self.unmanagedString = unmanagedString
109-
index = start
110-
}
111-
112-
mutating func tryFill(buffer: UnsafeMutableBufferPointer<UInt16>) -> Int? {
113-
var bufferIndex = 0
114-
let originalIndex = index
115-
repeat {
116-
guard index < unmanagedString.count else {
117-
break
118-
}
119-
120-
guard bufferIndex < buffer.count else {
121-
//The buffer isn't big enough for the current segment
122-
index = originalIndex
123-
return nil
124-
}
125-
126-
let cu = unmanagedString[index]
127-
buffer[bufferIndex] = cu
128-
index += 1
129-
bufferIndex += 1
130-
} while unmanagedString.hasNormalizationBoundary(
131-
after: index - 1) == false
96+
} while !collection.hasNormalizationBoundary(after: collection.index(before: index))
13297

13398
return bufferIndex
13499
}
@@ -266,4 +231,4 @@ extension _SegmentSource {
266231
) -> Int? {
267232
return tryFill(buffer: _castOutputBuffer(buffer))
268233
}
269-
}
234+
}

0 commit comments

Comments
 (0)