Skip to content

Commit d4026cc

Browse files
authored
Merge pull request #371 from macdrevx/fix-nbsp-escape-fast-path
Fix nbsp not escaped when it's the only special character
2 parents 6baf7e8 + fc78ef0 commit d4026cc

File tree

2 files changed

+91
-2
lines changed

2 files changed

+91
-2
lines changed

Sources/Entities.swift

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,12 @@ public final class Entities: Sendable {
601601
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
602602
return true
603603
}
604+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
604610
if inAttribute {
605611
if escapeMode == .xhtml,
606612
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {
@@ -634,6 +640,12 @@ public final class Entities: Sendable {
634640
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
635641
return true
636642
}
643+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
637649
if inAttribute {
638650
if escapeMode == .xhtml,
639651
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {
@@ -662,11 +674,18 @@ public final class Entities: Sendable {
662674
count > 0 {
663675
var needsEscape = false
664676
var sawWhitespace = false
665-
for b in string {
677+
for i in string.indices {
678+
let b = string[i]
666679
if encoderIsAscii && b >= asciiUpperLimitByte {
667680
needsEscape = true
668681
break
669682
}
683+
if b == StringUtil.utf8NBSPLead,
684+
i + 1 < string.endIndex,
685+
string[i + 1] == StringUtil.utf8NBSPTrail {
686+
needsEscape = true
687+
break
688+
}
670689
if normaliseWhite && b.isWhitespace {
671690
sawWhitespace = true
672691
break
@@ -843,6 +862,12 @@ public final class Entities: Sendable {
843862
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
844863
return true
845864
}
865+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
846871
if inAttribute {
847872
if escapeMode == .xhtml,
848873
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {
@@ -876,6 +901,12 @@ public final class Entities: Sendable {
876901
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
877902
return true
878903
}
904+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
879910
if inAttribute {
880911
if escapeMode == .xhtml,
881912
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {
@@ -904,11 +935,18 @@ public final class Entities: Sendable {
904935
count > 0 {
905936
var needsEscape = false
906937
var sawWhitespace = false
907-
for b in string {
938+
for i in string.indices {
939+
let b = string[i]
908940
if encoderIsAscii && b >= asciiUpperLimitByte {
909941
needsEscape = true
910942
break
911943
}
944+
if b == StringUtil.utf8NBSPLead,
945+
string.index(after: i) < string.endIndex,
946+
string[string.index(after: i)] == StringUtil.utf8NBSPTrail {
947+
needsEscape = true
948+
break
949+
}
912950
if normaliseWhite && b.isWhitespace {
913951
sawWhitespace = true
914952
break
@@ -1084,6 +1122,12 @@ public final class Entities: Sendable {
10841122
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
10851123
return true
10861124
}
1125+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
10871131
if inAttribute {
10881132
if escapeMode == .xhtml,
10891133
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {
@@ -1117,6 +1161,12 @@ public final class Entities: Sendable {
11171161
if memchr(base, Int32(TokeniserStateVars.ampersandByte), len) != nil {
11181162
return true
11191163
}
1164+
// Fast-path nbsp detection. In UTF-8, U+00A0 is a two-byte lead/trail pair, and
// the same lead byte also starts other characters (e.g. U+00A9 "©"). A single
// memchr only reports the FIRST lead byte, so every occurrence must be examined;
// otherwise text such as "©\u{A0}" is wrongly reported as needing no escape.
do {
    var nbspScanOffset = 0
    // At least two bytes must remain for a lead/trail pair to fit.
    while nbspScanOffset + 1 < len,
          let nbspLead = memchr(base + nbspScanOffset,
                                Int32(StringUtil.utf8NBSPLead),
                                len - nbspScanOffset) {
        let idx = base.distance(to: nbspLead.assumingMemoryBound(to: UInt8.self))
        if idx + 1 < len, base[idx + 1] == StringUtil.utf8NBSPTrail {
            return true
        }
        // Lead byte belonged to another character; resume just past it.
        nbspScanOffset = idx + 1
    }
}
11201170
if inAttribute {
11211171
if escapeMode == .xhtml,
11221172
memchr(base, Int32(TokeniserStateVars.lessThanByte), len) != nil {

Tests/SwiftSoupTests/EntitiesTest.swift

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,43 @@ class EntitiesTest: XCTestCase {
141141
doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml)
142142
XCTAssertEqual("<a title=\"&lt;p>One&lt;/p>\">One</a>", try element.outerHtml())
143143
}
144+
145+
/// A non-breaking space that is the only special character in the input must
/// still be caught by the fast-path needsEscape check and written as an entity
/// instead of raw U+00A0 bytes.
func testNbspEscapedWhenOnlySpecialChar() {
    let input = "hello\u{A0}world"
    let namedEntity = "hello&nbsp;world"

    // No explicit settings (default: UTF-8, extended mode) → &nbsp;
    XCTAssertEqual(namedEntity, Entities.escape(input))

    // UTF-8 with base escape mode → &nbsp;
    let utf8Base = OutputSettings().charset(.utf8).escapeMode(Entities.EscapeMode.base)
    XCTAssertEqual(namedEntity, Entities.escape(input, utf8Base))

    // UTF-8 with XHTML escape mode → numeric reference &#xa0;
    let utf8Xhtml = OutputSettings().charset(.utf8).escapeMode(Entities.EscapeMode.xhtml)
    XCTAssertEqual("hello&#xa0;world", Entities.escape(input, utf8Xhtml))

    // ASCII charset with base escape mode → must escape as well
    let asciiBase = OutputSettings().charset(.ascii).escapeMode(Entities.EscapeMode.base)
    XCTAssertEqual(namedEntity, Entities.escape(input, asciiBase))
}
165+
166+
/// Round-trip check: HTML containing `&nbsp;` is parsed and serialized again,
/// and the output must carry the `&nbsp;` entity rather than raw U+00A0 bytes.
func testNbspPreservedThroughParseAndSerialize() throws {
    let html = "<p>hello&nbsp;world</p>"
    let doc = try SwiftSoup.parse(html)
    // XCTUnwrap reports a missing <p> as a test failure; a force unwrap would
    // crash the whole test process instead.
    let p = try XCTUnwrap(try doc.select("p").first())
    let output = try p.html()

    XCTAssertEqual("hello&nbsp;world", output)
    XCTAssertFalse(output.contains("\u{A0}"), "Output should not contain raw U+00A0")
}
177+
178+
/// Every non-breaking space in the input is escaped, even when the text
/// contains no other special characters.
func testMultipleNbspEscaped() {
    let input = "a\u{A0}b\u{A0}c"
    let escaped = Entities.escape(input)
    XCTAssertEqual("a&nbsp;b&nbsp;c", escaped)
}
144183
}

0 commit comments

Comments
 (0)