Skip to content

Commit f700688

Browse files
committed
[se-0405] adapt implementation from staging package
1 parent 15a6c01 commit f700688

File tree

2 files changed

+157
-0
lines changed

2 files changed

+157
-0
lines changed

stdlib/public/core/String.swift

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,114 @@ extension String {
493493
self = String._fromNonContiguousUnsafeBitcastUTF8Repairing(codeUnits).0
494494
}
495495

496+
/// Creates a new `String` by copying the given code units and validating
/// them against the specified encoding.
///
/// Ill-formed code unit sequences are never repaired by this initializer;
/// if one is encountered, the result is `nil`.
///
/// The example below calls this initializer twice: once with a well-formed
/// UTF-8 code unit sequence, and once with an ill-formed UTF-16 code unit
/// sequence (it ends in an unpaired surrogate).
///
///     let validUTF8: [UInt8] = [67, 97, 102, 195, 169]
///     let valid = String(validating: validUTF8, as: UTF8.self)
///     print(valid)
///     // Prints "Optional("Café")"
///
///     let invalidUTF16: [UInt16] = [0x41, 0x42, 0xd801]
///     let invalid = String(validating: invalidUTF16, as: UTF16.self)
///     print(invalid)
///     // Prints "nil"
///
/// - Parameters:
///   - codeUnits: A sequence of code units that encode a `String`.
///   - encoding: A conformer to `Unicode.Encoding` to be used
///     to decode `codeUnits`.
@inlinable
@available(SwiftStdlib 5.10, *)
public init?<Encoding: Unicode.Encoding>(
  validating codeUnits: some Sequence<Encoding.CodeUnit>,
  as encoding: Encoding.Type
) {
  // Fast path: when the sequence exposes contiguous storage, validate the
  // buffer directly. The outer optional is `nil` when no contiguous storage
  // is available; the inner optional is `nil` when validation failed.
  let contiguousResult: String?? = codeUnits.withContiguousStorageIfAvailable {
    buffer in
    String._validate(buffer, as: Encoding.self)
  }
  switch contiguousResult {
  case .some(.some(let validated)):
    self = validated
    return
  case .some(.none):
    return nil
  case .none:
    break
  }

  // Slow path: transcode to UTF-8 one code unit at a time, stopping at the
  // first error, and note along the way whether every byte was ASCII.
  var utf8Bytes: [UTF8.CodeUnit] = []
  utf8Bytes.reserveCapacity(codeUnits.underestimatedCount)
  var sawOnlyASCII = true
  let hadError = transcode(
    codeUnits.makeIterator(),
    from: Encoding.self,
    to: UTF8.self,
    stoppingOnError: true
  ) { byte in
    utf8Bytes.append(byte)
    if (byte & 0x80) != 0 { sawOnlyASCII = false }
  }
  guard !hadError else { return nil }
  self = utf8Bytes.withUnsafeBufferPointer { validatedBytes in
    String._uncheckedFromUTF8(validatedBytes, asciiPreScanResult: sawOnlyASCII)
  }
}
555+
556+
/// Creates a new `String` by copying the given sequence of `Int8` values
/// and validating them against the specified encoding.
///
/// Ill-formed code unit sequences are never repaired by this initializer;
/// if one is encountered, the result is `nil`.
///
/// The example below calls this initializer twice: once with a well-formed
/// UTF-8 code unit sequence, and once with an ill-formed ASCII code unit
/// sequence.
///
///     let validUTF8: [Int8] = [67, 97, 102, -61, -87]
///     let valid = String(validating: validUTF8, as: UTF8.self)
///     print(valid)
///     // Prints "Optional("Café")"
///
///     let invalidASCII: [Int8] = [67, 97, -5]
///     let invalid = String(validating: invalidASCII, as: Unicode.ASCII.self)
///     print(invalid)
///     // Prints "nil"
///
/// - Parameters:
///   - codeUnits: A sequence of code units that encode a `String`.
///   - encoding: A conformer to `Unicode.Encoding` that can decode
///     `codeUnits` as `UInt8`.
@inlinable
@available(SwiftStdlib 5.10, *)
public init?<Encoding>(
  validating codeUnits: some Sequence<Int8>,
  as encoding: Encoding.Type
) where Encoding: Unicode.Encoding, Encoding.CodeUnit == UInt8 {
  // Fast path: view contiguous `Int8` storage as `UInt8` and validate it in
  // place. The outer optional is `nil` when no contiguous storage is
  // available; the inner optional is `nil` when validation failed.
  let contiguousResult: String?? = codeUnits.withContiguousStorageIfAvailable {
    signedBytes in
    signedBytes.withMemoryRebound(to: UInt8.self) { unsignedBytes in
      String._validate(unsignedBytes, as: Encoding.self)
    }
  }
  switch contiguousResult {
  case .some(.some(let validated)):
    self = validated
    return
  case .some(.none):
    return nil
  case .none:
    break
  }

  // Slow path: lazily reinterpret each element's bit pattern as `UInt8` and
  // hand the result to the `validating:as:` initializer for that sequence.
  let unsignedUnits = codeUnits.lazy.map { UInt8(bitPattern: $0) }
  guard let validated = String(validating: unsignedUnits, as: Encoding.self)
  else { return nil }
  self = validated
}
603+
496604
/// Creates a new string with the specified capacity in UTF-8 code units, and
497605
/// then calls the given closure with a buffer covering the string's
498606
/// uninitialized memory.

stdlib/public/core/StringCreate.swift

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,4 +298,53 @@ extension String {
298298
String._uncheckedFromUTF8($0)
299299
}
300300
}
301+
302+
/// Validates the code units in `input` against `encoding` and, if they are
/// well-formed, returns a `String` built from them.
///
/// UTF-8 and ASCII input is checked in place and handed directly to
/// `String._uncheckedFromUTF8`. Input in any other encoding is transcoded
/// to UTF-8 in a temporary allocation, stopping at the first error.
///
/// - Parameters:
///   - input: A buffer of code units to validate.
///   - encoding: The encoding in which `input` is expressed.
/// - Returns: The validated string, or `nil` if validation or transcoding
///   reported an error.
@usableFromInline
@available(SwiftStdlib 5.10, *)
internal static func _validate<Encoding: Unicode.Encoding>(
  _ input: UnsafeBufferPointer<Encoding.CodeUnit>,
  as encoding: Encoding.Type
) -> String? {
  // Fast paths apply only to byte-based encodings that are UTF-8 or ASCII;
  // any other byte-based encoding falls through to the transcoding path.
  if encoding.CodeUnit.self == UInt8.self {
    let bytes = _identityCast(input, to: UnsafeBufferPointer<UInt8>.self)
    if encoding.self == UTF8.self {
      guard case .success(let info) = validateUTF8(bytes) else { return nil }
      return String._uncheckedFromUTF8(
        bytes, asciiPreScanResult: info.isASCII
      )
    }
    if encoding.self == Unicode.ASCII.self {
      guard _allASCII(bytes) else { return nil }
      return String._uncheckedFromUTF8(bytes, asciiPreScanResult: true)
    }
  }

  // Slow path: transcode into a scratch buffer sized by a worst-case
  // estimate of UTF-8 bytes per input code unit.
  let worstCaseBytesPerUnit = (encoding.self == UTF16.self) ? 3 : 4
  return withUnsafeTemporaryAllocation(
    of: UInt8.self, capacity: input.count * worstCaseBytesPerUnit
  ) { scratch -> String? in
    var sawOnlyASCII = true
    var written = scratch.startIndex
    let hadError = transcode(
      input.makeIterator(),
      from: encoding.self,
      to: UTF8.self,
      stoppingOnError: true
    ) { byte in
      scratch[written] = byte
      written += 1
      if (byte & 0x80) != 0 { sawOnlyASCII = false }
    }
    guard !hadError else { return nil }
    // Only the first `written` bytes of the scratch buffer were filled in.
    let bytes = UnsafeBufferPointer(start: scratch.baseAddress, count: written)
    return String._uncheckedFromUTF8(bytes, asciiPreScanResult: sawOnlyASCII)
  }
}
301350
}

0 commit comments

Comments
 (0)