Skip to content

Commit d8533a2

Browse files
authored
SF-0033: Implement String.Encoding.ianaName and String.Encoding(ianaName:). (#1286)
* Import implementation for String Encoding Names from other repo.
  - source: https://github.com/YOCKOW/SF-StringEncodingNameImpl
* Import tests for String Encoding Names from other repo.
  - source: https://github.com/YOCKOW/SF-StringEncodingNameImpl/blob/0.4.0/Tests/StringEncodingNameImplTests/StringEncodingNameParserTests.swift
* Remove dead code in terms of the current proposal.
* Use `Testing` for String Encoding Names tests.
* NFC: Fix indentation in "String+Encoding+Names.swift".
* SF-0033: Adjust comments/attributes to match the accepted proposal.
* Auto-generate Swift source code for IANA Charset names.
* Remove unnecessary `@inlinable`.
* Simplify `String.init(ianaName:)`.
* Add new files related to SF-0033 to CMakeLists.txt.
* Rewrite script in Swift instead of Python.
  In response to: #1286 (comment)
* Simplify logic to parse IANA Charset names.
  In response to:
  - #1286 (comment)
  - #1286 (comment)
  - #1286 (comment)
* Fix spelling of functions for "case-insensitively".
  In response to:
  - #1286 (comment)
* Remove redundant nested function in `String.Encoding(ianaName:)`.
  In response to:
  - #1286 (comment)
1 parent b7be535 commit d8533a2

File tree

6 files changed

+690
-0
lines changed

6 files changed

+690
-0
lines changed

Sources/FoundationEssentials/String/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
target_sources(FoundationEssentials PRIVATE
1616
BidirectionalCollection.swift
1717
BuiltInUnicodeScalarSet.swift
18+
IANACharsetNames.swift
1819
RegexPatternCache.swift
1920
String+Bridging.swift
2021
String+Comparison.swift
2122
String+Encoding.swift
23+
String+Encoding+Names.swift
2224
String+EndianAdaptorSequence.swift
2325
String+Essentials.swift
2426
String+IO.swift
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
14+
// WARNING: DO NOT EDIT THIS FILE DIRECTLY.
15+
// This is auto-generated by `update-iana-charset-names`.
16+
17+
18+
extension IANACharset {
    /// Charset entry named `US-ASCII`.
    static let usASCII: IANACharset = .init(
        preferredMIMEName: "US-ASCII",
        name: "US-ASCII",
        aliases: [
            "iso-ir-6", "ANSI_X3.4-1968", "ANSI_X3.4-1986", "ISO_646.irv:1991", "ISO646-US",
            "US-ASCII", "us", "IBM367", "cp367", "csASCII",
        ]
    )

    /// Charset entry whose preferred MIME name is `ISO-8859-1`.
    static let iso8859_1: IANACharset = .init(
        preferredMIMEName: "ISO-8859-1",
        name: "ISO_8859-1:1987",
        aliases: [
            "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "l1",
            "IBM819", "CP819", "csISOLatin1",
        ]
    )

    /// Charset entry whose preferred MIME name is `ISO-8859-2`.
    static let iso8859_2: IANACharset = .init(
        preferredMIMEName: "ISO-8859-2",
        name: "ISO_8859-2:1987",
        aliases: [
            "iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "l2", "csISOLatin2",
        ]
    )

    /// Charset entry named `Shift_JIS`.
    static let shiftJIS: IANACharset = .init(
        preferredMIMEName: "Shift_JIS",
        name: "Shift_JIS",
        aliases: ["MS_Kanji", "csShiftJIS"]
    )

    /// Charset entry whose preferred MIME name is `EUC-JP`.
    static let eucJP: IANACharset = .init(
        preferredMIMEName: "EUC-JP",
        name: "Extended_UNIX_Code_Packed_Format_for_Japanese",
        aliases: ["csEUCPkdFmtJapanese", "EUC-JP"]
    )

    /// Charset entry named `ISO-2022-JP`.
    static let iso2022JP: IANACharset = .init(
        preferredMIMEName: "ISO-2022-JP",
        name: "ISO-2022-JP",
        aliases: ["csISO2022JP"]
    )

    /// Charset entry named `UTF-8`.
    static let utf8: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-8",
        aliases: ["csUTF8"]
    )

    /// Charset entry named `UTF-16BE`.
    static let utf16BE: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-16BE",
        aliases: ["csUTF16BE"]
    )

    /// Charset entry named `UTF-16LE`.
    static let utf16LE: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-16LE",
        aliases: ["csUTF16LE"]
    )

    /// Charset entry named `UTF-16`.
    static let utf16: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-16",
        aliases: ["csUTF16"]
    )

    /// Charset entry named `UTF-32`.
    static let utf32: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-32",
        aliases: ["csUTF32"]
    )

    /// Charset entry named `UTF-32BE`.
    static let utf32BE: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-32BE",
        aliases: ["csUTF32BE"]
    )

    /// Charset entry named `UTF-32LE`.
    static let utf32LE: IANACharset = .init(
        preferredMIMEName: nil,
        name: "UTF-32LE",
        aliases: ["csUTF32LE"]
    )

    /// Charset entry named `macintosh`.
    static let macintosh: IANACharset = .init(
        preferredMIMEName: nil,
        name: "macintosh",
        aliases: ["mac", "csMacintosh"]
    )

    /// Charset entry named `windows-1250`.
    static let windows1250: IANACharset = .init(
        preferredMIMEName: nil,
        name: "windows-1250",
        aliases: ["cswindows1250"]
    )

    /// Charset entry named `windows-1251`.
    static let windows1251: IANACharset = .init(
        preferredMIMEName: nil,
        name: "windows-1251",
        aliases: ["cswindows1251"]
    )

    /// Charset entry named `windows-1252`.
    static let windows1252: IANACharset = .init(
        preferredMIMEName: nil,
        name: "windows-1252",
        aliases: ["cswindows1252"]
    )

    /// Charset entry named `windows-1253`.
    static let windows1253: IANACharset = .init(
        preferredMIMEName: nil,
        name: "windows-1253",
        aliases: ["cswindows1253"]
    )

    /// Charset entry named `windows-1254`.
    static let windows1254: IANACharset = .init(
        preferredMIMEName: nil,
        name: "windows-1254",
        aliases: ["cswindows1254"]
    )
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
14+
// MARK: - Private extensions for parsing encoding names
15+
16+
private extension UTF8.CodeUnit {
    /// Reports whether this code unit matches `other` when case is disregarded.
    ///
    /// - Parameter other: The code unit to compare against.
    /// - Returns: `true` when `self` equals `other`, `other._uppercased`,
    ///   or `other._lowercased`; otherwise `false`.
    func _isASCIICaseInsensitivelyEqual(to other: UTF8.CodeUnit) -> Bool {
        // NOTE(review): `_uppercased` / `_lowercased` are declared elsewhere;
        // presumably they only transform ASCII letters — confirm.
        if self == other {
            return true
        }
        return self == other._uppercased || self == other._lowercased
    }
}
24+
25+
private extension String {
    /// Reports whether this string and `other` have the same UTF-8 code units
    /// when each pair of code units is compared case-insensitively.
    ///
    /// - Parameter other: The string to compare against.
    /// - Returns: `true` only when both UTF-8 views have the same length and
    ///   every code unit of `self` matches the code unit of `other` at the
    ///   same position; otherwise `false`.
    func _isASCIICaseInsensitivelyEqual(to other: String) -> Bool {
        // `elementsEqual(_:by:)` walks both views in lockstep and fails on the
        // first mismatching pair or when one view ends before the other.
        return utf8.elementsEqual(other.utf8) { mine, theirs in
            mine._isASCIICaseInsensitivelyEqual(to: theirs)
        }
    }
}
40+
41+
42+
// MARK: - IANA Charset Names
43+
44+
/// A single charset entry: its optional preferred MIME name, its name,
/// and its aliases.
internal struct IANACharset {
    /// The preferred MIME name, or `nil` when the entry has none.
    let preferredMIMEName: String?

    /// The name of this charset.
    let name: String

    /// The aliases of this charset.
    let aliases: [String]

    /// The preferred MIME name when there is one; otherwise `name`.
    var representativeName: String {
        if let preferredMIMEName {
            return preferredMIMEName
        }
        return name
    }

    /// Creates an entry from its three components.
    ///
    /// - Parameters:
    ///   - preferredMIMEName: The preferred MIME name, if any.
    ///   - name: The name of the charset.
    ///   - aliases: The aliases of the charset.
    init(preferredMIMEName: String?, name: String, aliases: [String]) {
        self.preferredMIMEName = preferredMIMEName
        self.name = name
        self.aliases = aliases
    }

    /// Reports whether `string` names this charset.
    ///
    /// The preferred MIME name (when present), the name, and then each alias
    /// are compared against `string` with `_isASCIICaseInsensitivelyEqual(to:)`.
    ///
    /// - Parameter string: The candidate charset name.
    /// - Returns: `true` as soon as any of the known names matches.
    func matches(_ string: String) -> Bool {
        if preferredMIMEName?._isASCIICaseInsensitivelyEqual(to: string) == true {
            return true
        }
        if name._isASCIICaseInsensitivelyEqual(to: string) {
            return true
        }
        return aliases.contains { $0._isASCIICaseInsensitivelyEqual(to: string) }
    }
}
81+
82+
83+
// MARK: - `String.Encoding` Names
84+
85+
extension String.Encoding {
    /// The charset entry associated with this encoding, or `nil` when this
    /// encoding is not one of the cases listed below.
    private var _ianaCharset: IANACharset? {
        switch self {
        case .utf8: .utf8
        case .ascii: .usASCII
        case .japaneseEUC: .eucJP
        case .isoLatin1: .iso8859_1
        case .shiftJIS: .shiftJIS
        case .isoLatin2: .iso8859_2
        case .unicode: .utf16
        case .windowsCP1251: .windows1251
        case .windowsCP1252: .windows1252
        case .windowsCP1253: .windows1253
        case .windowsCP1254: .windows1254
        case .windowsCP1250: .windows1250
        case .iso2022JP: .iso2022JP
        case .macOSRoman: .macintosh
        case .utf16BigEndian: .utf16BE
        case .utf16LittleEndian: .utf16LE
        case .utf32: .utf32
        case .utf32BigEndian: .utf32BE
        case .utf32LittleEndian: .utf32LE
        default: nil
        }
    }

    /// The name of this encoding that is compatible with the one of the IANA registry "charset".
    ///
    /// The value is the preferred MIME name of the associated charset when it
    /// has one, and the charset's name otherwise. It is `nil` when this
    /// encoding has no associated charset entry.
    @available(FoundationPreview 6.3, *)
    public var ianaName: String? {
        return _ianaCharset?.representativeName
    }

    /// Creates an instance from the name of the IANA registry "charset".
    ///
    /// - Note: The given name is compared to each IANA "charset" name
    ///         with ASCII case-insensitive collation
    ///         to determine which encoding is suitable.
    ///
    /// - Parameter charsetName: A charset name; the preferred MIME name, the
    ///   name, and the aliases of each candidate charset are all considered.
    /// - Returns: `nil` when no candidate encoding's charset matches `charsetName`.
    @available(FoundationPreview 6.3, *)
    public init?(ianaName charsetName: String) {
        // Candidates are tried in this order; the first match wins.
        let possibilities: [String.Encoding] = [
            .utf8,
            .ascii,
            .japaneseEUC,
            .isoLatin1,
            .shiftJIS,
            .isoLatin2,
            .unicode, // .utf16
            .windowsCP1251,
            .windowsCP1252,
            .windowsCP1253,
            .windowsCP1254,
            .windowsCP1250,
            .iso2022JP,
            .macOSRoman,
            .utf16BigEndian,
            .utf16LittleEndian,
            .utf32,
            .utf32BigEndian,
            .utf32LittleEndian,
        ]

        // Optional chaining instead of a force-unwrap: if this list and the
        // `_ianaCharset` switch ever drift apart, an unmapped candidate is
        // skipped rather than trapping inside a failable initializer.
        guard let match = possibilities.first(where: { candidate in
            candidate._ianaCharset?.matches(charsetName) == true
        }) else {
            return nil
        }
        self = match
    }
}
155+

0 commit comments

Comments
 (0)