Skip to content

Commit acae3d2

Browse files
authored
Implement isoLatin1 and macOSRoman encoding (#743)
* Implement isoLatin1 and macOSRoman encoding * Limit new behavior to non-framework build
1 parent 5463e8e commit acae3d2

File tree

5 files changed

+325
-3
lines changed

5 files changed

+325
-3
lines changed

Sources/FoundationEssentials/String/String+IO.swift

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,38 @@
1414
internal import _ForSwiftFoundation
1515
#endif
1616

17+
internal import _FoundationCShims
18+
1719
fileprivate let stringEncodingAttributeName = "com.apple.TextEncoding"
1820

21+
/// Adapts a sequence of bytes into a sequence of 16-bit code units by
/// zero-extending every `UInt8` element to a `UInt16`.
private struct ExtendingToUTF16Sequence<Base: Sequence<UInt8>> : Sequence {
    typealias Element = UInt16

    /// Pulls bytes from the wrapped iterator and widens them one at a time.
    struct Iterator : IteratorProtocol {
        private var source: Base.Iterator

        init(_ base: Base.Iterator) {
            source = base
        }

        mutating func next() -> Element? {
            // `nil` from the underlying iterator ends this sequence as well.
            source.next().map { byte in UInt16(byte) }
        }
    }

    /// The byte sequence being widened.
    private let wrapped: Base

    init(_ base: Base) {
        wrapped = base
    }

    func makeIterator() -> Iterator {
        let underlying = wrapped.makeIterator()
        return Iterator(underlying)
    }
}
47+
48+
1949
@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
2050
extension String {
2151
/// Returns a `String` initialized by converting given `data` into
@@ -139,6 +169,31 @@ extension String {
139169
} else {
140170
return nil
141171
}
172+
#if !FOUNDATION_FRAMEWORK
173+
case .isoLatin1:
174+
guard bytes.allSatisfy(\.isValidISOLatin1) else {
175+
return nil
176+
}
177+
// isoLatin1 is an 8-bit encoding that represents a subset of UTF-16
178+
// Map to 16-bit values and decode as UTF-16
179+
self.init(_validating: ExtendingToUTF16Sequence(bytes), as: UTF16.self)
180+
case .macOSRoman:
181+
func buildString(_ bytes: UnsafeBufferPointer<UInt8>) -> String {
182+
String(unsafeUninitializedCapacity: bytes.count * 3) { buffer in
183+
var next = 0
184+
for byte in bytes {
185+
if Unicode.ASCII.isASCII(byte) {
186+
buffer.initializeElement(at: next, to: byte)
187+
next += 1
188+
} else {
189+
next = buffer.suffix(from: next).initialize(fromContentsOf: byte.macRomanNonASCIIAsUTF8)
190+
}
191+
}
192+
return next
193+
}
194+
}
195+
self = bytes.withContiguousStorageIfAvailable(buildString) ?? Array(bytes).withUnsafeBufferPointer(buildString)
196+
#endif
142197
default:
143198
#if FOUNDATION_FRAMEWORK
144199
// In the framework, we can fall back to NS/CFString to handle more esoteric encodings.
@@ -261,6 +316,8 @@ internal func encodingFromDataForExtendedAttribute(_ value: Data) -> String.Enco
261316
}
262317
#else
263318
foundEncoding = switch enc {
319+
case 0x0: .macOSRoman
320+
case 0x0201: .isoLatin1
264321
case 0x0600: .ascii
265322
case 0x08000100: .utf8
266323
case 0x0100: .utf16
@@ -304,6 +361,8 @@ internal func encodingFromDataForExtendedAttribute(_ value: Data) -> String.Enco
304361
case "utf-32": return .utf32
305362
case "utf-32be": return .utf32BigEndian
306363
case "utf-32le": return .utf32LittleEndian
364+
case "iso-8859-1": return .isoLatin1
365+
case "macintosh": return .macOSRoman
307366
default: return nil // Unknown encoding value
308367
}
309368
#endif
@@ -322,6 +381,8 @@ internal func extendedAttributeData(for encoding: String.Encoding) -> Data? {
322381
let encodingName = CFStringConvertEncodingToIANACharSetName(cfEncoding)
323382
#else
324383
let cfEncoding : UInt? = switch encoding {
384+
case .macOSRoman: 0x0
385+
case .isoLatin1: 0x0201
325386
case .ascii: 0x0600
326387
case .utf8: 0x08000100
327388
case .utf16: 0x0100
@@ -346,6 +407,8 @@ internal func extendedAttributeData(for encoding: String.Encoding) -> Data? {
346407
case .utf32: "utf-32"
347408
case .utf32BigEndian: "utf-32be"
348409
case .utf32LittleEndian: "utf-32le"
410+
case .macOSRoman: "macintosh"
411+
case .isoLatin1: "iso-8859-1"
349412
default: nil
350413
}
351414
#endif

Sources/FoundationEssentials/String/StringProtocol+Essentials.swift

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,61 @@ import Darwin
1919
import Glibc
2020
#endif
2121

22+
internal import _FoundationCShims
23+
24+
extension BinaryInteger {
    /// Whether this value is a code point representable in ISO Latin 1 (ISO-8859-1).
    ///
    /// ISO-8859-1 assigns every byte value `0x00...0xFF` to the Unicode scalar with the
    /// same numeric value, including the C0 (`0x00...0x1F`, `0x7F`) and C1 (`0x80...0x9F`)
    /// control codes. Control codes must be accepted, otherwise ordinary text containing
    /// a newline or tab could neither be decoded from nor encoded to this encoding.
    var isValidISOLatin1: Bool {
        // A value is valid exactly when it fits in a single byte. Using the failable
        // conversion avoids comparing against literals that may not fit in `Self`.
        UInt8(exactly: self) != nil
    }
}
29+
30+
extension UInt8 {
    /// One row of the Mac OS Roman mapping table: a UTF-8 sequence of two or three
    /// bytes, where a two-byte sequence carries `0` in the third slot.
    private typealias UTF8Representation = (UInt8, UInt8, UInt8)

    /// Calls `body` with the C mapping table viewed as a buffer of
    /// `_STRINGSHIMS_MACROMAN_MAP_SIZE` rows of `UTF8Representation`.
    private static func withMacRomanMap<R>(_ body: (UnsafeBufferPointer<UTF8Representation>) -> R) -> R {
        withUnsafePointer(to: _stringshims_macroman_mapping) {
            $0.withMemoryRebound(to: UTF8Representation.self, capacity: Int(_STRINGSHIMS_MACROMAN_MAP_SIZE)) {
                body(UnsafeBufferPointer(start: $0, count: Int(_STRINGSHIMS_MACROMAN_MAP_SIZE)))
            }
        }
    }

    /// The UTF-8 bytes for this non-ASCII Mac OS Roman byte.
    ///
    /// The table is indexed with `self - 128`, so row `i` of the table must describe
    /// Mac OS Roman code `0x80 + i`. The receiver must not be ASCII.
    var macRomanNonASCIIAsUTF8: some Collection<UInt8> {
        assert(!Unicode.ASCII.isASCII(self))
        return Self.withMacRomanMap { map in
            let utf8Rep = map[Int(self) - 128]
            // A zero third byte marks a two-byte UTF-8 sequence.
            if utf8Rep.2 == 0 {
                return [utf8Rep.0, utf8Rep.1]
            } else {
                return [utf8Rep.0, utf8Rep.1, utf8Rep.2]
            }
        }
    }

    /// Creates the Mac OS Roman byte for `scalar`, or returns `nil` when the scalar
    /// has no representation in the mapping table.
    ///
    /// ASCII scalars map to themselves. Any other scalar is searched for in the table
    /// by its UTF-8 form, and the resulting byte is the row index plus 128.
    init?(macRomanFor scalar: UnicodeScalar) {
        guard !scalar.isASCII else {
            self.init(scalar.value)
            return
        }

        // U+2126 OHM SIGN is canonically equivalent to U+03A9 GREEK CAPITAL LETTER OMEGA,
        // and Mac OS Roman has a single code for both, so look it up as U+03A9.
        let lookupScalar: UnicodeScalar = scalar.value == 0x2126 ? "\u{03A9}" : scalar

        let utf8 = Array(lookupScalar.utf8)
        // Table rows hold at most three bytes; four-byte scalars are never representable.
        guard utf8.count <= 3 else {
            return nil
        }
        let tuple = (utf8[0], utf8[1], utf8.count == 2 ? 0 : utf8[2])

        let value: UInt8? = Self.withMacRomanMap { map in
            guard let found = map.firstIndex(where: { $0 == tuple }) else {
                return nil
            }
            // The table may contain more than 128 rows; an index that does not fit in a
            // byte after adding 128 must fail the conversion instead of trapping on overflow.
            return UInt8(exactly: found + 128)
        }

        guard let value else { return nil }
        self = value
    }
}
76+
2277
// These provide concrete implementations for String and Substring, enhancing performance over generic StringProtocol.
2378

2479
@available(FoundationPreview 0.4, *)
@@ -156,6 +211,26 @@ extension String {
156211
}
157212

158213
return data + swapped
214+
#if !FOUNDATION_FRAMEWORK
215+
case .isoLatin1:
216+
return try? Data(capacity: self.utf16.count) { buffer in
217+
for scalar in self.utf16 {
218+
guard scalar.isValidISOLatin1 else {
219+
throw CocoaError(.fileWriteInapplicableStringEncoding)
220+
}
221+
buffer.appendElement(UInt8(scalar & 0xFF))
222+
}
223+
}
224+
case .macOSRoman:
225+
return try? Data(capacity: self.unicodeScalars.count) { buffer in
226+
for scalar in self.unicodeScalars {
227+
guard let value = UInt8(macRomanFor: scalar) else {
228+
throw CocoaError(.fileWriteInapplicableStringEncoding)
229+
}
230+
buffer.appendElement(value)
231+
}
232+
}
233+
#endif
159234
default:
160235
#if FOUNDATION_FRAMEWORK
161236
// Other encodings, defer to the CoreFoundation implementation

Sources/_FoundationCShims/include/string_shims.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define CSHIMS_STRING_H
1515

1616
#include "_CShimsMacros.h"
17+
#include "_CStdlib.h"
1718

1819
#if __has_include(<locale.h>)
1920
#include <locale.h>
@@ -42,6 +43,9 @@ INTERNAL double _stringshims_strtod_l(const char * _Nullable __restrict nptr, ch
4243

4344
INTERNAL float _stringshims_strtof_l(const char * _Nullable __restrict nptr, char * _Nullable * _Nullable __restrict endptr, locale_t _Nullable loc);
4445

46+
/* Number of rows in the _stringshims_macroman_mapping table. */
#define _STRINGSHIMS_MACROMAN_MAP_SIZE 129
47+
/* Table of UTF-8 byte sequences for the non-ASCII Mac OS Roman characters (defined in string_shims.c). Each row holds up to three bytes; two-byte sequences carry 0x00 in the last slot. */
INTERNAL const uint8_t _stringshims_macroman_mapping[_STRINGSHIMS_MACROMAN_MAP_SIZE][3];
48+
4549
#ifdef __cplusplus
4650
}
4751
#endif

Sources/_FoundationCShims/string_shims.c

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,135 @@ float _stringshims_strtof_l(const char * _Nullable restrict nptr,
102102
return result;
103103
#endif
104104
}
105+
106+
/*
 * UTF-8 representation of every non-ASCII Mac OS Roman character.
 *
 * Rows are ordered by Mac OS Roman code: row i holds the UTF-8 bytes of code
 * 0x80 + i. Consumers decode a byte with `table[byte - 128]` and encode by
 * adding 128 to the index of the matching row, so the order is significant.
 * Two-byte sequences are padded with 0x00 in the third slot.
 *
 * Code 0xBD maps to U+03A9 GREEK CAPITAL LETTER OMEGA, as in Apple's published
 * mapping; its canonical equivalent U+2126 OHM SIGN has no row of its own.
 *
 * The declared row count (_STRINGSHIMS_MACROMAN_MAP_SIZE) is one larger than
 * the 128 non-ASCII codes, so a trailing all-zero row pads the table. It can
 * never match a real UTF-8 sequence.
 */
const uint8_t _stringshims_macroman_mapping[_STRINGSHIMS_MACROMAN_MAP_SIZE][3] = {
    { 0xC3, 0x84, 0x00 }, /* 0x80 LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 0xC3, 0x85, 0x00 }, /* 0x81 LATIN CAPITAL LETTER A WITH RING ABOVE */
    { 0xC3, 0x87, 0x00 }, /* 0x82 LATIN CAPITAL LETTER C WITH CEDILLA */
    { 0xC3, 0x89, 0x00 }, /* 0x83 LATIN CAPITAL LETTER E WITH ACUTE */
    { 0xC3, 0x91, 0x00 }, /* 0x84 LATIN CAPITAL LETTER N WITH TILDE */
    { 0xC3, 0x96, 0x00 }, /* 0x85 LATIN CAPITAL LETTER O WITH DIAERESIS */
    { 0xC3, 0x9C, 0x00 }, /* 0x86 LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 0xC3, 0xA1, 0x00 }, /* 0x87 LATIN SMALL LETTER A WITH ACUTE */
    { 0xC3, 0xA0, 0x00 }, /* 0x88 LATIN SMALL LETTER A WITH GRAVE */
    { 0xC3, 0xA2, 0x00 }, /* 0x89 LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 0xC3, 0xA4, 0x00 }, /* 0x8A LATIN SMALL LETTER A WITH DIAERESIS */
    { 0xC3, 0xA3, 0x00 }, /* 0x8B LATIN SMALL LETTER A WITH TILDE */
    { 0xC3, 0xA5, 0x00 }, /* 0x8C LATIN SMALL LETTER A WITH RING ABOVE */
    { 0xC3, 0xA7, 0x00 }, /* 0x8D LATIN SMALL LETTER C WITH CEDILLA */
    { 0xC3, 0xA9, 0x00 }, /* 0x8E LATIN SMALL LETTER E WITH ACUTE */
    { 0xC3, 0xA8, 0x00 }, /* 0x8F LATIN SMALL LETTER E WITH GRAVE */
    { 0xC3, 0xAA, 0x00 }, /* 0x90 LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 0xC3, 0xAB, 0x00 }, /* 0x91 LATIN SMALL LETTER E WITH DIAERESIS */
    { 0xC3, 0xAD, 0x00 }, /* 0x92 LATIN SMALL LETTER I WITH ACUTE */
    { 0xC3, 0xAC, 0x00 }, /* 0x93 LATIN SMALL LETTER I WITH GRAVE */
    { 0xC3, 0xAE, 0x00 }, /* 0x94 LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 0xC3, 0xAF, 0x00 }, /* 0x95 LATIN SMALL LETTER I WITH DIAERESIS */
    { 0xC3, 0xB1, 0x00 }, /* 0x96 LATIN SMALL LETTER N WITH TILDE */
    { 0xC3, 0xB3, 0x00 }, /* 0x97 LATIN SMALL LETTER O WITH ACUTE */
    { 0xC3, 0xB2, 0x00 }, /* 0x98 LATIN SMALL LETTER O WITH GRAVE */
    { 0xC3, 0xB4, 0x00 }, /* 0x99 LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 0xC3, 0xB6, 0x00 }, /* 0x9A LATIN SMALL LETTER O WITH DIAERESIS */
    { 0xC3, 0xB5, 0x00 }, /* 0x9B LATIN SMALL LETTER O WITH TILDE */
    { 0xC3, 0xBA, 0x00 }, /* 0x9C LATIN SMALL LETTER U WITH ACUTE */
    { 0xC3, 0xB9, 0x00 }, /* 0x9D LATIN SMALL LETTER U WITH GRAVE */
    { 0xC3, 0xBB, 0x00 }, /* 0x9E LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 0xC3, 0xBC, 0x00 }, /* 0x9F LATIN SMALL LETTER U WITH DIAERESIS */
    { 0xE2, 0x80, 0xA0 }, /* 0xA0 DAGGER */
    { 0xC2, 0xB0, 0x00 }, /* 0xA1 DEGREE SIGN */
    { 0xC2, 0xA2, 0x00 }, /* 0xA2 CENT SIGN */
    { 0xC2, 0xA3, 0x00 }, /* 0xA3 POUND SIGN */
    { 0xC2, 0xA7, 0x00 }, /* 0xA4 SECTION SIGN */
    { 0xE2, 0x80, 0xA2 }, /* 0xA5 BULLET */
    { 0xC2, 0xB6, 0x00 }, /* 0xA6 PILCROW SIGN */
    { 0xC3, 0x9F, 0x00 }, /* 0xA7 LATIN SMALL LETTER SHARP S */
    { 0xC2, 0xAE, 0x00 }, /* 0xA8 REGISTERED SIGN */
    { 0xC2, 0xA9, 0x00 }, /* 0xA9 COPYRIGHT SIGN */
    { 0xE2, 0x84, 0xA2 }, /* 0xAA TRADE MARK SIGN */
    { 0xC2, 0xB4, 0x00 }, /* 0xAB ACUTE ACCENT */
    { 0xC2, 0xA8, 0x00 }, /* 0xAC DIAERESIS */
    { 0xE2, 0x89, 0xA0 }, /* 0xAD NOT EQUAL TO */
    { 0xC3, 0x86, 0x00 }, /* 0xAE LATIN CAPITAL LIGATURE AE */
    { 0xC3, 0x98, 0x00 }, /* 0xAF LATIN CAPITAL LETTER O WITH STROKE */
    { 0xE2, 0x88, 0x9E }, /* 0xB0 INFINITY */
    { 0xC2, 0xB1, 0x00 }, /* 0xB1 PLUS-MINUS SIGN */
    { 0xE2, 0x89, 0xA4 }, /* 0xB2 LESS-THAN OR EQUAL TO */
    { 0xE2, 0x89, 0xA5 }, /* 0xB3 GREATER-THAN OR EQUAL TO */
    { 0xC2, 0xA5, 0x00 }, /* 0xB4 YEN SIGN */
    { 0xC2, 0xB5, 0x00 }, /* 0xB5 MICRO SIGN */
    { 0xE2, 0x88, 0x82 }, /* 0xB6 PARTIAL DIFFERENTIAL */
    { 0xE2, 0x88, 0x91 }, /* 0xB7 N-ARY SUMMATION */
    { 0xE2, 0x88, 0x8F }, /* 0xB8 N-ARY PRODUCT */
    { 0xCF, 0x80, 0x00 }, /* 0xB9 GREEK SMALL LETTER PI */
    { 0xE2, 0x88, 0xAB }, /* 0xBA INTEGRAL */
    { 0xC2, 0xAA, 0x00 }, /* 0xBB FEMININE ORDINAL INDICATOR */
    { 0xC2, 0xBA, 0x00 }, /* 0xBC MASCULINE ORDINAL INDICATOR */
    { 0xCE, 0xA9, 0x00 }, /* 0xBD GREEK CAPITAL LETTER OMEGA */
    { 0xC3, 0xA6, 0x00 }, /* 0xBE LATIN SMALL LIGATURE AE */
    { 0xC3, 0xB8, 0x00 }, /* 0xBF LATIN SMALL LETTER O WITH STROKE */
    { 0xC2, 0xBF, 0x00 }, /* 0xC0 INVERTED QUESTION MARK */
    { 0xC2, 0xA1, 0x00 }, /* 0xC1 INVERTED EXCLAMATION MARK */
    { 0xC2, 0xAC, 0x00 }, /* 0xC2 NOT SIGN */
    { 0xE2, 0x88, 0x9A }, /* 0xC3 SQUARE ROOT */
    { 0xC6, 0x92, 0x00 }, /* 0xC4 LATIN SMALL LETTER F WITH HOOK */
    { 0xE2, 0x89, 0x88 }, /* 0xC5 ALMOST EQUAL TO */
    { 0xE2, 0x88, 0x86 }, /* 0xC6 INCREMENT */
    { 0xC2, 0xAB, 0x00 }, /* 0xC7 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
    { 0xC2, 0xBB, 0x00 }, /* 0xC8 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
    { 0xE2, 0x80, 0xA6 }, /* 0xC9 HORIZONTAL ELLIPSIS */
    { 0xC2, 0xA0, 0x00 }, /* 0xCA NO-BREAK SPACE */
    { 0xC3, 0x80, 0x00 }, /* 0xCB LATIN CAPITAL LETTER A WITH GRAVE */
    { 0xC3, 0x83, 0x00 }, /* 0xCC LATIN CAPITAL LETTER A WITH TILDE */
    { 0xC3, 0x95, 0x00 }, /* 0xCD LATIN CAPITAL LETTER O WITH TILDE */
    { 0xC5, 0x92, 0x00 }, /* 0xCE LATIN CAPITAL LIGATURE OE */
    { 0xC5, 0x93, 0x00 }, /* 0xCF LATIN SMALL LIGATURE OE */
    { 0xE2, 0x80, 0x93 }, /* 0xD0 EN DASH */
    { 0xE2, 0x80, 0x94 }, /* 0xD1 EM DASH */
    { 0xE2, 0x80, 0x9C }, /* 0xD2 LEFT DOUBLE QUOTATION MARK */
    { 0xE2, 0x80, 0x9D }, /* 0xD3 RIGHT DOUBLE QUOTATION MARK */
    { 0xE2, 0x80, 0x98 }, /* 0xD4 LEFT SINGLE QUOTATION MARK */
    { 0xE2, 0x80, 0x99 }, /* 0xD5 RIGHT SINGLE QUOTATION MARK */
    { 0xC3, 0xB7, 0x00 }, /* 0xD6 DIVISION SIGN */
    { 0xE2, 0x97, 0x8A }, /* 0xD7 LOZENGE */
    { 0xC3, 0xBF, 0x00 }, /* 0xD8 LATIN SMALL LETTER Y WITH DIAERESIS */
    { 0xC5, 0xB8, 0x00 }, /* 0xD9 LATIN CAPITAL LETTER Y WITH DIAERESIS */
    { 0xE2, 0x81, 0x84 }, /* 0xDA FRACTION SLASH */
    { 0xE2, 0x82, 0xAC }, /* 0xDB EURO SIGN */
    { 0xE2, 0x80, 0xB9 }, /* 0xDC SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
    { 0xE2, 0x80, 0xBA }, /* 0xDD SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
    { 0xEF, 0xAC, 0x81 }, /* 0xDE LATIN SMALL LIGATURE FI */
    { 0xEF, 0xAC, 0x82 }, /* 0xDF LATIN SMALL LIGATURE FL */
    { 0xE2, 0x80, 0xA1 }, /* 0xE0 DOUBLE DAGGER */
    { 0xC2, 0xB7, 0x00 }, /* 0xE1 MIDDLE DOT */
    { 0xE2, 0x80, 0x9A }, /* 0xE2 SINGLE LOW-9 QUOTATION MARK */
    { 0xE2, 0x80, 0x9E }, /* 0xE3 DOUBLE LOW-9 QUOTATION MARK */
    { 0xE2, 0x80, 0xB0 }, /* 0xE4 PER MILLE SIGN */
    { 0xC3, 0x82, 0x00 }, /* 0xE5 LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 0xC3, 0x8A, 0x00 }, /* 0xE6 LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 0xC3, 0x81, 0x00 }, /* 0xE7 LATIN CAPITAL LETTER A WITH ACUTE */
    { 0xC3, 0x8B, 0x00 }, /* 0xE8 LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 0xC3, 0x88, 0x00 }, /* 0xE9 LATIN CAPITAL LETTER E WITH GRAVE */
    { 0xC3, 0x8D, 0x00 }, /* 0xEA LATIN CAPITAL LETTER I WITH ACUTE */
    { 0xC3, 0x8E, 0x00 }, /* 0xEB LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 0xC3, 0x8F, 0x00 }, /* 0xEC LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 0xC3, 0x8C, 0x00 }, /* 0xED LATIN CAPITAL LETTER I WITH GRAVE */
    { 0xC3, 0x93, 0x00 }, /* 0xEE LATIN CAPITAL LETTER O WITH ACUTE */
    { 0xC3, 0x94, 0x00 }, /* 0xEF LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 0xEF, 0xA3, 0xBF }, /* 0xF0 Apple logo */
    { 0xC3, 0x92, 0x00 }, /* 0xF1 LATIN CAPITAL LETTER O WITH GRAVE */
    { 0xC3, 0x9A, 0x00 }, /* 0xF2 LATIN CAPITAL LETTER U WITH ACUTE */
    { 0xC3, 0x9B, 0x00 }, /* 0xF3 LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 0xC3, 0x99, 0x00 }, /* 0xF4 LATIN CAPITAL LETTER U WITH GRAVE */
    { 0xC4, 0xB1, 0x00 }, /* 0xF5 LATIN SMALL LETTER DOTLESS I */
    { 0xCB, 0x86, 0x00 }, /* 0xF6 MODIFIER LETTER CIRCUMFLEX ACCENT */
    { 0xCB, 0x9C, 0x00 }, /* 0xF7 SMALL TILDE */
    { 0xC2, 0xAF, 0x00 }, /* 0xF8 MACRON */
    { 0xCB, 0x98, 0x00 }, /* 0xF9 BREVE */
    { 0xCB, 0x99, 0x00 }, /* 0xFA DOT ABOVE */
    { 0xCB, 0x9A, 0x00 }, /* 0xFB RING ABOVE */
    { 0xC2, 0xB8, 0x00 }, /* 0xFC CEDILLA */
    { 0xCB, 0x9D, 0x00 }, /* 0xFD DOUBLE ACUTE ACCENT */
    { 0xCB, 0x9B, 0x00 }, /* 0xFE OGONEK */
    { 0xCB, 0x87, 0x00 }, /* 0xFF CARON */
    { 0x00, 0x00, 0x00 }, /* padding row: keeps the declared row count, matches no character */
};

0 commit comments

Comments
 (0)