Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Sources/FoundationEssentials/String/String+IO.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ dynamic public func _cfMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>,
// Provide swift-corelibs-foundation with an entry point to convert some bytes into a String
return nil
}

/// Entry point for decoding `bytes`, interpreted in `encoding`, by way of ICU.
///
/// This default body always returns `nil`. The function is declared `dynamic`
/// so that another module can install a real implementation with
/// `@_dynamicReplacement`.
///
/// - Parameters:
///   - bytes: The encoded bytes to decode.
///   - encoding: The encoding `bytes` is expressed in.
/// - Returns: The decoded string; always `nil` for this default body.
dynamic package func _icuMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
// Concrete implementation is provided by FoundationInternationalization.
return nil
}
#endif

@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
Expand Down Expand Up @@ -184,6 +189,17 @@ extension String {
}
}
self = bytes.withContiguousStorageIfAvailable(buildString) ?? Array(bytes).withUnsafeBufferPointer(buildString)
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
guard let string = (
bytes.withContiguousStorageIfAvailable({ _icuMakeStringFromBytes($0, encoding: encoding) }) ??
Array(bytes).withUnsafeBufferPointer({ _icuMakeStringFromBytes($0, encoding: encoding) })
) else {
return nil
}
self = string
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ dynamic public func _cfStringEncodingConvert(string: String, using encoding: UIn
// Dynamically replaced by swift-corelibs-foundation to implement encodings that we do not have Swift replacements for, yet
return nil
}

/// Entry point for encoding `string` into `encoding` by way of ICU.
///
/// This default body always returns `nil`. The function is declared `dynamic`
/// so that another module can install a real implementation with
/// `@_dynamicReplacement`.
///
/// - Parameters:
///   - string: The string to encode.
///   - encoding: The target encoding.
///   - allowLossyConversion: Whether the caller accepts a lossy conversion.
/// - Returns: The encoded bytes; always `nil` for this default body.
dynamic package func _icuStringEncodingConvert(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
// Concrete implementation is provided by FoundationInternationalization.
return nil
}
#endif

@available(FoundationPreview 0.4, *)
Expand Down Expand Up @@ -249,6 +254,11 @@ extension String {
buffer.appendElement(value)
}
}
case .japaneseEUC:
// Here we catch encodings that are supported by Foundation Framework
// but are not supported by corelibs-foundation.
// We delegate conversion to ICU.
return _icuStringEncodingConvert(string: self, using: encoding, allowLossyConversion: allowLossyConversion)
#endif
default:
#if FOUNDATION_FRAMEWORK
Expand Down
206 changes: 206 additions & 0 deletions Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if canImport(FoundationEssentials)
import FoundationEssentials
#endif
internal import _FoundationICU

private extension String.Encoding {
    /// The converter name handed to ICU for this encoding, or `nil` when the
    /// encoding has no entry in the table below.
    var _icuConverterName: String? {
        // TODO: Replace this with forthcoming(?) public property such as https://github.com/swiftlang/swift-foundation/pull/1243
        // Note: UTF-* and US-ASCII are omitted here because they are supposed to be converted upstream.
        switch self {
        // Japanese
        case .japaneseEUC:
            return "EUC-JP"
        case .shiftJIS:
            return "Shift_JIS"
        case .iso2022JP:
            return "ISO-2022-JP"

        // ISO 8859
        case .isoLatin1:
            return "ISO-8859-1"
        case .isoLatin2:
            return "ISO-8859-2"

        // Windows code pages
        case .windowsCP1250:
            return "windows-1250"
        case .windowsCP1251:
            return "windows-1251"
        case .windowsCP1252:
            return "windows-1252"
        case .windowsCP1253:
            return "windows-1253"
        case .windowsCP1254:
            return "windows-1254"

        // Classic Mac OS
        case .macOSRoman:
            return "macintosh"

        default:
            return nil
        }
    }
}

extension ICU {
    /// Owns an ICU converter opened for a single `String.Encoding`.
    ///
    /// The raw converter pointer lives inside a `LockedState`, is only touched
    /// through `withLock`, and is closed when the instance is deallocated.
    final class StringConverter: @unchecked Sendable {
        /// The encoding this converter was opened for.
        let encoding: String.Encoding

        private let _converter: LockedState<OpaquePointer> // UConverter*

        /// Opens an ICU converter for `encoding`.
        ///
        /// Fails when the encoding has no ICU converter name or when ICU
        /// does not open a converter successfully.
        init?(encoding: String.Encoding) {
            guard let name = encoding._icuConverterName else {
                return nil
            }

            var openStatus: UErrorCode = U_ZERO_ERROR
            let opened = ucnv_open(name, &openStatus)
            guard let handle = opened, openStatus.isSuccess else {
                return nil
            }

            self.encoding = encoding
            self._converter = LockedState(initialState: handle)
        }

        deinit {
            _converter.withLock { handle in
                ucnv_close(handle)
            }
        }
    }
}

extension ICU.StringConverter {
    /// Decodes `data`, interpreted in this converter's encoding, into a `String`.
    ///
    /// - Parameter data: The encoded bytes.
    /// - Returns: The decoded string, or `nil` when the conversion does not
    ///   yield a string or the input is too large for the 32-bit lengths
    ///   that the ICU calls take.
    func decode(data: Data) -> String? {
        return _converter.withLock { converter -> String? in
            // The converter is reused across calls, so reset its
            // to-Unicode state on every exit path.
            defer {
                ucnv_resetToUnicode(converter)
            }

            // ICU takes 32-bit lengths; reject oversized input instead of trapping.
            guard let srcLength = CInt(exactly: data.count) else {
                return nil
            }
            let (scaled, overflowed) = srcLength.multipliedReportingOverflow(
                by: CInt(ucnv_getMinCharSize(converter))
            )
            guard !overflowed, scaled < CInt.max else {
                return nil
            }
            // Only a starting size: the buffer helper is handed the ICU status
            // and is expected to retry with a larger buffer when needed.
            let initCapacity = scaled + 1

            return _withResizingUCharBuffer(initialSize: initCapacity) { (dest, capacity, status) in
                return data.withUnsafeBytes { src in
                    ucnv_toUChars(
                        converter,
                        dest,
                        capacity,
                        src.baseAddress,
                        srcLength,
                        &status
                    )
                }
            }
        }
    }

    /// Encodes `string` into this converter's encoding.
    ///
    /// - Parameters:
    ///   - string: The string to encode.
    ///   - lossy: When `true`, unencodable input is replaced with a
    ///     substitution character; when `false`, conversion stops at the
    ///     first unencodable input and the call fails.
    /// - Returns: The encoded bytes, or `nil` when ICU reports an error or the
    ///   input is too large for the 32-bit lengths that the ICU calls take.
    func encode(string: String, allowLossyConversion lossy: Bool) -> Data? {
        return _converter.withLock { (converter) -> Data? in
            // The converter is reused across calls, so reset its
            // from-Unicode state on every exit path.
            defer {
                ucnv_resetFromUnicode(converter)
            }

            // Copy the UTF-16 view into contiguous storage that ICU can read.
            let utf16Rep = string.utf16
            let uchars = UnsafeMutableBufferPointer<UChar>.allocate(capacity: utf16Rep.count)
            defer {
                uchars.deallocate()
            }
            _ = uchars.initialize(fromContentsOf: utf16Rep)

            // ICU takes 32-bit lengths; reject oversized input instead of trapping.
            guard let srcLength = CInt(exactly: uchars.count) else {
                return nil
            }
            let (scaled, overflowed) = srcLength.multipliedReportingOverflow(
                by: CInt(ucnv_getMaxCharSize(converter))
            )
            guard !overflowed, scaled < CInt.max else {
                return nil
            }
            let capacity = scaled + 1

            // Configure how unencodable input is handled *before* allocating the
            // destination buffer, so these early exits have nothing to free.
            var status: UErrorCode = U_ZERO_ERROR
            if lossy {
                var lossyChar: UChar = encoding == .ascii ? 0xFF : 0x3F
                ucnv_setSubstString(
                    converter,
                    &lossyChar,
                    1,
                    &status
                )
                guard status.isSuccess else { return nil }

                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_SUBSTITUTE,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            } else {
                ucnv_setFromUCallBack(
                    converter,
                    UCNV_FROM_U_CALLBACK_STOP,
                    nil, // newContext
                    nil, // oldAction
                    nil, // oldContext
                    &status
                )
                guard status.isSuccess else { return nil }
            }

            let dest = UnsafeMutableRawPointer.allocate(
                byteCount: Int(capacity),
                alignment: MemoryLayout<CChar>.alignment
            )
            let actualLength = ucnv_fromUChars(
                converter,
                dest,
                capacity,
                uchars.baseAddress,
                srcLength,
                &status
            )
            guard status.isSuccess else {
                // Ownership was never handed to `Data`; free the buffer here
                // so a failed conversion does not leak it.
                dest.deallocate()
                return nil
            }
            // `Data` takes ownership of `dest` and frees it through the custom deallocator.
            return Data(
                bytesNoCopy: dest,
                count: Int(actualLength),
                deallocator: .custom({ pointer, _ in pointer.deallocate() })
            )
        }
    }
}

extension ICU.StringConverter {
    /// Converters created so far, keyed by encoding.
    private static let _converters: LockedState<[String.Encoding: ICU.StringConverter]> = .init(initialState: [:])

    /// Returns the stored converter for `encoding`, creating and storing one
    /// on first use.
    ///
    /// - Returns: `nil` when no converter can be created for `encoding`;
    ///   nothing is stored in that case.
    static func converter(for encoding: String.Encoding) -> ICU.StringConverter? {
        return _converters.withLock { cache -> ICU.StringConverter? in
            if let existing = cache[encoding] {
                return existing
            }
            guard let created = ICU.StringConverter(encoding: encoding) else {
                return nil
            }
            cache[encoding] = created
            return created
        }
    }
}


#if !FOUNDATION_FRAMEWORK
/// ICU-backed replacement for `_icuMakeStringFromBytes(_:encoding:)`.
///
/// - Parameters:
///   - bytes: The encoded bytes to decode. The buffer is only read for the
///     duration of the call.
///   - encoding: The encoding `bytes` is expressed in.
/// - Returns: The decoded string, or `nil` when no converter is available for
///   `encoding` or the bytes cannot be decoded.
@_dynamicReplacement(for: _icuMakeStringFromBytes(_:encoding:))
func _icuMakeStringFromBytes_impl(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
    guard let converter = ICU.StringConverter.converter(for: encoding) else {
        return nil
    }

    // A buffer without a base address is necessarily empty. Empty input
    // decodes to the empty string; it is not a conversion failure.
    guard let pointer = bytes.baseAddress else {
        return ""
    }

    // Since we want to avoid unnecessary copy here,
    // `bytes` is converted to `UnsafeMutableRawPointer`
    // because `Data(bytesNoCopy:count:deallocator:)` accepts only that type.
    // This operation is still safe,
    // as the pointer is just borrowed (not escaped, not mutated)
    // in `ICU.StringConverter.decode(data:) -> String?`.
    // In addition to that, `Data` is useful here
    // because it is `Sendable` (and has CoW behavior).
    let data = Data(
        bytesNoCopy: UnsafeMutableRawPointer(mutating: pointer),
        count: bytes.count,
        deallocator: .none
    )
    return converter.decode(data: data)
}

/// ICU-backed replacement for `_icuStringEncodingConvert(string:using:allowLossyConversion:)`.
///
/// Looks up the converter for `encoding` and asks it to encode `string`.
/// Returns `nil` when no converter is available or the encoding step fails.
@_dynamicReplacement(for: _icuStringEncodingConvert(string:using:allowLossyConversion:))
func _icuStringEncodingConvert_impl(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
    return ICU.StringConverter
        .converter(for: encoding)?
        .encode(string: string, allowLossyConversion: allowLossyConversion)
}
#endif
126 changes: 126 additions & 0 deletions Tests/FoundationInternationalizationTests/StringTests+Data.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if FOUNDATION_FRAMEWORK
@testable import Foundation
#else
@testable import FoundationEssentials
@testable import FoundationInternationalization
#endif // FOUNDATION_FRAMEWORK

#if canImport(TestSupport)
import TestSupport
#endif

final class StringConverterTests: XCTestCase {
/// Asserts that `string` encodes to exactly `data` in `encoding`, and that
/// `data` decodes back to exactly `string`.
///
/// `file` and `line` are forwarded so failures are reported at the call site.
private func _test_roundTripConversion(
    string: String,
    data: Data,
    encoding: String._Encoding,
    file: StaticString = #filePath,
    line: UInt = #line
) {
    // String -> Data
    let encoded = string.data(using: encoding)
    XCTAssertEqual(encoded, data, "Failed to convert string to data.", file: file, line: line)

    // Data -> String
    let decoded = String(data: data, encoding: encoding)
    XCTAssertEqual(string, decoded, "Failed to convert data to string.", file: file, line: line)
}

func test_japaneseEUC() {
// Confirm that https://github.com/swiftlang/swift-foundation/issues/1016 is fixed.

// ASCII
_test_roundTripConversion(
string: "ABC",
data: Data([0x41, 0x42, 0x43]),
encoding: .japaneseEUC
)

// Plane 1 Row 1
_test_roundTripConversion(
string: "、。◇",
data: Data([
0xA1, 0xA2,
0xA1, 0xA3,
0xA1, 0xFE,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 4 (Hiragana)
_test_roundTripConversion(
string: "ひらがな",
data: Data([
0xA4, 0xD2,
0xA4, 0xE9,
0xA4, 0xAC,
0xA4, 0xCA,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 5 (Katakana)
_test_roundTripConversion(
string: "ヴヵヶ",
data: Data([
0xA5, 0xF4,
0xA5, 0xF5,
0xA5, 0xF6,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 6 (Greek Alphabets)
_test_roundTripConversion(
string: "Σπ",
data: Data([
0xA6, 0xB2,
0xA6, 0xD0,
]),
encoding: .japaneseEUC
)

// Basic Kanji
_test_roundTripConversion(
string: "日本",
data: Data([
0xC6, 0xFC,
0xCB, 0xDC,
]),
encoding: .japaneseEUC
)

// Amendment by JIS83/JIS90
_test_roundTripConversion(
string: "扉⇔穴",
data: Data([
0xC8, 0xE2,
0xA2, 0xCE,
0xB7, 0xEA,
]),
encoding: .japaneseEUC
)

// Unsupported characters
let sushi = "Sushi🍣"
XCTAssertNil(sushi.data(using: String._Encoding.japaneseEUC))
// The lossy result differs by implementation: Foundation.framework substitutes
// one "?" per UTF-16 code unit of the emoji (two in total), while the ICU-backed
// path substitutes a single "?" for the whole code point.
#if FOUNDATION_FRAMEWORK
let expectedLossyText = "Sushi??"
#else
let expectedLossyText = "Sushi?"
#endif
XCTAssertEqual(
sushi.data(using: String._Encoding.japaneseEUC, allowLossyConversion: true),
expectedLossyText.data(using: .utf8)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test unfortunately fails in the FOUNDATION_FRAMEWORK configuration because Foundation.framework's implementation of this conversion replaces each of the emoji's two UTF-16 code units (the surrogate pair) with ? rather than replacing the entire character with a single ?. I suspect that ICU is replacing it with only a single ? - do you happen to know if that's intentional? If so, for now we might need to vary the expected value to be Sushi?? in FOUNDATION_FRAMEWORK vs. Sushi? in !FOUNDATION_FRAMEWORK to account for the discrepancy between Foundation.framework and ICU (assuming we think this result from ICU is indeed correct and not a bug in how we're calling ICU).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for overlooking this again, but I'm not sure whether we should implement the conversion exactly the way Foundation.framework does.
The behavior of Foundation.framework looks buggy to me because it doesn't seem to handle surrogate pairs correctly. (🍣 is outside the BMP but consists of only one scalar, U+1F363. Does that "buggy" behavior come from the historical fact that NSString is represented as a sequence of UTF-16 code units?)
Of course, we can imitate that behavior by replacing UCNV_FROM_U_CALLBACK_SUBSTITUTE with a custom callback function.
However, let me point out that swift-foundation's current implementation for .ascii works per code point, not per UTF-16 code unit:

import Foundation

let sushiEmoji = "🍣"
print(sushiEmoji.data(using: .ascii, allowLossyConversion: true)!.count)
// -> Prints "1"

Options...?

  • To stay consistent with the current implementation of, e.g., .ascii:
    • Divide the test case for FOUNDATION_FRAMEWORK and !FOUNDATION_FRAMEWORK.
  • To make it compatible with Foundation framework's behavior:
    • Implement a custom callback for ucnv_setFromUCallBack.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I suspect this behavior comes from the fact that NSString is natively stored as UTF-16. For now, let's just divide the test case with a FOUNDATION_FRAMEWORK/!FOUNDATION_FRAMEWORK value for this expectation since I think enabling the conversion at all on non-Darwin is a good step even if the behavior is slightly different than Foundation.framework, and I can come back with a separate change to look into making this behavior the same across both platforms (while ensuring we don't break compatibility with existing clients).

)
}
}

Loading