Skip to content

Commit 8f84db7

Browse files
committed
Simplify logic to parse IANA Charset names.
In response to:
- #1286 (comment)
- #1286 (comment)
- #1286 (comment)
1 parent e674fa6 commit 8f84db7

File tree

1 file changed

+17
-118
lines changed

1 file changed

+17
-118
lines changed

Sources/FoundationEssentials/String/String+Encoding+Names.swift

Lines changed: 17 additions & 118 deletions
Original file line number | Diff line number | Diff line change
@@ -13,126 +13,28 @@
1313

1414
// MARK: - Private extensions for parsing encoding names
1515

16-
private extension Unicode.Scalar {
17-
/// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace".
18-
///
19-
/// Reference: https://infra.spec.whatwg.org/#ascii-whitespace
20-
var _isASCIIWhitespace: Bool {
21-
switch self.value {
22-
case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true
16+
private extension UTF8.CodeUnit {
17+
func _isASCIICaseinsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool {
18+
return switch self {
19+
case other, other._uppercased, other._lowercased: true
2320
default: false
2421
}
2522
}
2623
}
2724

2825
private extension String {
29-
var _trimmed: Substring.UnicodeScalarView {
30-
let scalars = self.unicodeScalars
31-
let isNonWhitespace: (Unicode.Scalar) -> Bool = { !$0._isASCIIWhitespace }
32-
guard let firstIndexOfNonWhitespace = scalars.firstIndex(where: isNonWhitespace),
33-
let lastIndexOfNonWhitespace = scalars.lastIndex(where: isNonWhitespace) else {
34-
return Substring.UnicodeScalarView()
35-
}
36-
return scalars[firstIndexOfNonWhitespace...lastIndexOfNonWhitespace]
37-
}
38-
}
39-
40-
/// A type that holds a `Unicode.Scalar` where its value is compared case-insensitively with others'
41-
/// _if the value is within ASCII range_.
42-
private struct ASCIICaseInsensitiveUnicodeScalar: Equatable,
43-
ExpressibleByUnicodeScalarLiteral {
44-
typealias UnicodeScalarLiteralType = Unicode.Scalar.UnicodeScalarLiteralType
45-
46-
let scalar: Unicode.Scalar
47-
48-
init(_ scalar: Unicode.Scalar) {
49-
assert(scalar.isASCII)
50-
self.scalar = scalar
51-
}
52-
53-
init(unicodeScalarLiteral value: Unicode.Scalar.UnicodeScalarLiteralType) {
54-
self.init(Unicode.Scalar(unicodeScalarLiteral: value))
55-
}
56-
57-
static func ==(
58-
lhs: ASCIICaseInsensitiveUnicodeScalar,
59-
rhs: ASCIICaseInsensitiveUnicodeScalar
60-
) -> Bool {
61-
if lhs.scalar == rhs.scalar {
62-
return true
63-
} else if ("A"..."Z").contains(lhs.scalar) {
64-
return lhs.scalar.value + 0x20 == rhs.scalar.value
65-
} else if ("a"..."z").contains(lhs.scalar) {
66-
return lhs.scalar.value - 0x20 == rhs.scalar.value
67-
}
68-
return false
69-
}
70-
}
71-
72-
/// A type to tokenize string for `String.Encoding` names.
73-
internal protocol StringEncodingNameTokenizer: ~Copyable {
74-
associatedtype Token: Equatable
75-
init(name: String)
76-
mutating func nextToken() throws -> Token?
77-
}
78-
79-
extension StringEncodingNameTokenizer where Self: ~Copyable {
80-
mutating func hasEqualTokens(with other: consuming Self) throws -> Bool {
81-
while let myToken = try self.nextToken() {
82-
guard let otherToken = try other.nextToken(),
83-
myToken == otherToken else {
26+
func _isASCIICaseinsensitivelyEqual(to other: String) -> Bool {
27+
let (myUTF8, otherUTF8) = (self.utf8, other.utf8)
28+
var (myIndex, otherIndex) = (myUTF8.startIndex, otherUTF8.startIndex)
29+
while myIndex < myUTF8.endIndex && otherIndex < otherUTF8.endIndex {
30+
guard myUTF8[myIndex]._isASCIICaseinsensitivelyEqual(to: otherUTF8[otherIndex]) else {
8431
return false
8532
}
86-
}
87-
return try other.nextToken() == nil
88-
}
89-
}
90-
91-
92-
/// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s.
93-
private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable {
94-
typealias Token = ASCIICaseInsensitiveUnicodeScalar
9533

96-
enum Error: Swift.Error {
97-
case nonASCII
98-
}
99-
100-
let scalars: Substring.UnicodeScalarView
101-
102-
var _currentIndex: Substring.UnicodeScalarView.Index
103-
104-
init(name: String) {
105-
self.scalars = name._trimmed
106-
self._currentIndex = scalars.startIndex
107-
}
108-
109-
mutating func nextToken() throws -> Token? {
110-
guard _currentIndex < scalars.endIndex else {
111-
return nil
112-
}
113-
let scalar = scalars[_currentIndex]
114-
guard scalar.isASCII else { throw Error.nonASCII }
115-
defer {
116-
scalars.formIndex(after: &_currentIndex)
117-
}
118-
return ASCIICaseInsensitiveUnicodeScalar(scalar)
119-
}
120-
}
121-
122-
123-
private extension String {
124-
func isEqual<T>(
125-
to other: String,
126-
tokenizedBy tokenizer: T.Type
127-
) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable {
128-
do {
129-
var myTokenizer = T(name: self)
130-
let otherTokenizer = T(name: other)
131-
return try myTokenizer.hasEqualTokens(with: otherTokenizer)
132-
} catch {
133-
// Any errors imply that `self` or `other` contains invalid characters.
134-
return false
34+
myUTF8.formIndex(after: &myIndex)
35+
otherUTF8.formIndex(after: &otherIndex)
13536
}
37+
return myIndex == myUTF8.endIndex && otherIndex == otherUTF8.endIndex
13638
}
13739
}
13840

@@ -160,19 +62,16 @@ internal struct IANACharset {
16062
self.aliases = aliases
16163
}
16264

163-
func matches<T>(
164-
_ string: String,
165-
tokenizedBy tokenizer: T.Type
166-
) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable {
65+
func matches(_ string: String) -> Bool {
16766
if let preferredMIMEName = self.preferredMIMEName,
168-
preferredMIMEName.isEqual(to: string, tokenizedBy: tokenizer) {
67+
preferredMIMEName._isASCIICaseinsensitivelyEqual(to: string) {
16968
return true
17069
}
171-
if name.isEqual(to: string, tokenizedBy: tokenizer) {
70+
if name._isASCIICaseinsensitivelyEqual(to: string) {
17271
return true
17372
}
17473
for alias in aliases {
175-
if alias.isEqual(to: string, tokenizedBy: tokenizer) {
74+
if alias._isASCIICaseinsensitivelyEqual(to: string) {
17675
return true
17776
}
17877
}
@@ -249,7 +148,7 @@ extension String.Encoding {
249148
guard let ianaCharset = encoding._ianaCharset else {
250149
continue
251150
}
252-
if ianaCharset.matches(charsetName, tokenizedBy: ASCIICaseInsensitiveTokenizer.self) {
151+
if ianaCharset.matches(charsetName) {
253152
return encoding
254153
}
255154
}

0 commit comments

Comments
 (0)