Skip to content

Commit c7dc36b

Browse files
kylehowells and claude committed
Add configurable DoS protection limits
Add a ParserLimits struct with configurable limits to prevent denial-of-service attacks from pathological HTML input:

- maxEntityNameLength (default: 255): limits entity name collection to prevent memory-allocation attacks with inputs like &aaaa...
- maxNestingDepth (default: 512): limits DOM nesting depth to prevent stack overflow on deeply nested input (10,000+ levels).

The limits are configurable via the new `limits` parameter on the JustHTML initializers, with presets for .default, .strict, and .unlimited.

Includes a comprehensive test suite (DoSProtectionTests.swift) with 31 tests covering entity limits, nesting limits, and combined attack vectors.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 495acb2 commit c7dc36b

File tree

5 files changed

+705
-11
lines changed

5 files changed

+705
-11
lines changed

Sources/swift-justhtml/JustHTML.swift

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ public struct JustHTML {
2525
/// - scripting: Whether scripting is enabled
2626
/// - iframeSrcdoc: Whether parsing iframe srcdoc content
2727
/// - xmlCoercion: Whether to coerce output for XML compatibility
28+
/// - limits: Parser limits for DoS protection (defaults to sensible limits)
2829
/// - Throws: StrictModeError if strict mode is enabled and a parse error occurs
2930
public init(
3031
_ html: String,
@@ -33,7 +34,8 @@ public struct JustHTML {
3334
strict: Bool = false,
3435
scripting: Bool = false,
3536
iframeSrcdoc: Bool = false,
36-
xmlCoercion: Bool = false
37+
xmlCoercion: Bool = false,
38+
limits: ParserLimits = .default
3739
) throws {
3840
self.fragmentContext = fragmentContext
3941
self.encoding = nil
@@ -44,14 +46,16 @@ public struct JustHTML {
4446
fragmentContext: fragmentContext,
4547
iframeSrcdoc: iframeSrcdoc,
4648
collectErrors: shouldCollect,
47-
scripting: scripting
49+
scripting: scripting,
50+
maxNestingDepth: limits.maxNestingDepth
4851
)
4952

50-
let opts = Self.tokenizerOpts(
53+
var opts = Self.tokenizerOpts(
5154
fragmentContext: fragmentContext,
5255
xmlCoercion: xmlCoercion,
5356
scripting: scripting
5457
)
58+
opts.maxEntityNameLength = limits.maxEntityNameLength
5559

5660
let tokenizer = Tokenizer(treeBuilder, opts: opts, collectErrors: shouldCollect)
5761
treeBuilder.tokenizer = tokenizer
@@ -76,6 +80,7 @@ public struct JustHTML {
7680
/// - scripting: Whether scripting is enabled
7781
/// - iframeSrcdoc: Whether parsing iframe srcdoc content
7882
/// - xmlCoercion: Whether to coerce output for XML compatibility
83+
/// - limits: Parser limits for DoS protection (defaults to sensible limits)
7984
/// - Throws: StrictModeError if strict mode is enabled and a parse error occurs
8085
public init(
8186
data: Data,
@@ -85,7 +90,8 @@ public struct JustHTML {
8590
strict: Bool = false,
8691
scripting: Bool = false,
8792
iframeSrcdoc: Bool = false,
88-
xmlCoercion: Bool = false
93+
xmlCoercion: Bool = false,
94+
limits: ParserLimits = .default
8995
) throws {
9096
let (html, detectedEncoding) = decodeHTML(data, transportEncoding: transportEncoding)
9197

@@ -98,14 +104,16 @@ public struct JustHTML {
98104
fragmentContext: fragmentContext,
99105
iframeSrcdoc: iframeSrcdoc,
100106
collectErrors: shouldCollect,
101-
scripting: scripting
107+
scripting: scripting,
108+
maxNestingDepth: limits.maxNestingDepth
102109
)
103110

104-
let opts = Self.tokenizerOpts(
111+
var opts = Self.tokenizerOpts(
105112
fragmentContext: fragmentContext,
106113
xmlCoercion: xmlCoercion,
107114
scripting: scripting
108115
)
116+
opts.maxEntityNameLength = limits.maxEntityNameLength
109117

110118
let tokenizer = Tokenizer(treeBuilder, opts: opts, collectErrors: shouldCollect)
111119
treeBuilder.tokenizer = tokenizer
ParserLimits.swift (new file)

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// ParserLimits.swift - Configurable limits for DoS protection
2+
//
3+
// These limits prevent pathological inputs from causing crashes or
4+
// excessive resource consumption. The defaults are set high enough
5+
// that no real-world HTML document should ever hit them.
6+
7+
import Foundation
8+
9+
/// Configurable limits for the HTML parser.
///
/// These limits protect against denial-of-service attacks from malicious
/// or pathological HTML input. The default values are conservative and
/// should never be reached by legitimate web content.
///
/// Both limits are counts, so negative values are meaningless; any negative
/// value supplied through the initializer or assigned to a property is
/// clamped to `0`.
///
/// Example usage:
/// ```swift
/// // Use default limits (recommended for most use cases)
/// let doc = try JustHTML(html)
///
/// // Use larger limits for a server with lots of RAM
/// var limits = ParserLimits()
/// limits.maxNestingDepth = 2048
/// let doc = try JustHTML(html, limits: limits)
///
/// // Disable limits entirely (not recommended)
/// let doc = try JustHTML(html, limits: .unlimited)
/// ```
public struct ParserLimits: Sendable {
    /// Maximum length for named character reference entity names.
    ///
    /// The longest valid HTML entity name is ~31 characters
    /// (e.g., "CounterClockwiseContourIntegral"). Setting this limit prevents
    /// the tokenizer from allocating huge strings when parsing malicious
    /// input like `&aaaa...` with millions of characters.
    ///
    /// Negative assignments are clamped to `0`.
    ///
    /// Default: 255 characters
    public var maxEntityNameLength: Int {
        didSet {
            // Assigning inside `didSet` does not re-trigger the observer.
            if maxEntityNameLength < 0 { maxEntityNameLength = 0 }
        }
    }

    /// Maximum depth of nested elements in the DOM tree.
    ///
    /// Real web pages rarely exceed 100-200 levels of nesting. Extremely deep
    /// nesting (10,000+ levels) can cause stack overflow crashes during tree
    /// construction or serialization.
    ///
    /// When this limit is reached, additional elements are inserted as siblings
    /// rather than children, preserving the content while flattening the structure.
    ///
    /// Negative assignments are clamped to `0`.
    ///
    /// Default: 512 levels
    public var maxNestingDepth: Int {
        didSet {
            if maxNestingDepth < 0 { maxNestingDepth = 0 }
        }
    }

    /// Create parser limits, falling back to the default for any omitted value.
    ///
    /// - Parameters:
    ///   - maxEntityNameLength: Maximum entity name length in characters
    ///     (default: 255). Negative values are clamped to `0`.
    ///   - maxNestingDepth: Maximum element nesting depth in levels
    ///     (default: 512). Negative values are clamped to `0`.
    public init(
        maxEntityNameLength: Int = 255,
        maxNestingDepth: Int = 512
    ) {
        // Property observers do not run during initialization, so the
        // clamping has to be applied explicitly here as well.
        self.maxEntityNameLength = Swift.max(0, maxEntityNameLength)
        self.maxNestingDepth = Swift.max(0, maxNestingDepth)
    }

    /// Default limits suitable for most applications.
    ///
    /// These limits are conservative and should never be reached by
    /// legitimate web content.
    public static let `default` = ParserLimits()

    /// Unlimited parsing (no DoS protection).
    ///
    /// **Warning:** Using unlimited parsing on untrusted input may cause
    /// crashes or excessive memory/CPU usage. Only use this when parsing
    /// trusted content or when you have other safeguards in place.
    public static let unlimited = ParserLimits(
        maxEntityNameLength: Int.max,
        maxNestingDepth: Int.max
    )

    /// Strict limits for resource-constrained environments (e.g., mobile devices).
    ///
    /// These limits are more restrictive but should still handle all
    /// well-formed web content.
    public static let strict = ParserLimits(
        maxEntityNameLength: 128,
        maxNestingDepth: 256
    )
}

Sources/swift-justhtml/Tokenizer.swift

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,19 +110,23 @@ public struct TokenizerOpts {
110110
public var xmlCoercion: Bool
111111
public var discardBom: Bool
112112
public var scripting: Bool
113+
/// Maximum length for named character reference entity names (DoS protection)
114+
public var maxEntityNameLength: Int
113115

114116
/// Creates a set of tokenizer options.
///
/// Each argument is stored unchanged in the property of the same name.
///
/// - Parameters:
///   - initialState: State the tokenizer starts in (default: `.data`).
///   - initialRawtextTag: Optional tag name kept alongside the initial
///     state (default: `nil`). NOTE(review): presumably the tag that ends a
///     raw-text initial state; confirm against the tokenizer.
///   - xmlCoercion: Whether to coerce output for XML compatibility
///     (default: `false`).
///   - discardBom: Stored as-is (default: `false`). NOTE(review): presumably
///     controls dropping a leading byte-order mark; confirm.
///   - scripting: Whether scripting is enabled (default: `false`).
///   - maxEntityNameLength: Maximum length for named character reference
///     entity names, used for DoS protection
///     (default: `ParserLimits.default.maxEntityNameLength`).
public init(
    initialState: Tokenizer.State = .data,
    initialRawtextTag: String? = nil,
    xmlCoercion: Bool = false,
    discardBom: Bool = false,
    scripting: Bool = false,
    maxEntityNameLength: Int = ParserLimits.default.maxEntityNameLength
) {
    // Where tokenization starts.
    self.initialRawtextTag = initialRawtextTag
    self.initialState = initialState

    // Behaviour switches.
    self.scripting = scripting
    self.discardBom = discardBom
    self.xmlCoercion = xmlCoercion

    // DoS protection.
    self.maxEntityNameLength = maxEntityNameLength
}
127131
}
128132

@@ -3025,15 +3029,35 @@ public final class Tokenizer {
30253029
var matchedEntity: String? = nil
30263030
var matchedLength = 0
30273031
var consumed = 0
3032+
let maxLength = self.opts.maxEntityNameLength
3033+
var hitLimit = false
30283034

30293035
while let ch = peek() {
30303036
if ch.isASCIILetter || ch.isASCIIDigit {
3037+
// Check entity name length limit (DoS protection)
3038+
// The longest valid HTML entity is ~31 chars, so hitting this limit
3039+
// means the entity is definitely invalid - stop looking for matches
3040+
// but continue consuming to emit the full text
3041+
if consumed >= maxLength {
3042+
hitLimit = true
3043+
// Consume remaining alphanumeric characters and emit them as text
3044+
self.flushCharRefTempBuffer()
3045+
self.emitCharRefString(entityName)
3046+
// Emit remaining characters directly
3047+
while let next = peek(), next.isASCIILetter || next.isASCIIDigit {
3048+
self.emitChar(next)
3049+
_ = self.consume()
3050+
}
3051+
self.state = self.returnState
3052+
return
3053+
}
3054+
30313055
entityName.append(ch)
30323056
_ = self.consume()
30333057
consumed += 1
30343058

3035-
// Check for match
3036-
if let decoded = NAMED_ENTITIES[entityName] {
3059+
// Check for match (only if we haven't exceeded the limit)
3060+
if !hitLimit, let decoded = NAMED_ENTITIES[entityName] {
30373061
matchedEntity = decoded
30383062
matchedLength = consumed
30393063
}

Sources/swift-justhtml/TreeBuilder.swift

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ public final class TreeBuilder: TokenSink {
178178
public var errors: [ParseError] = []
179179
private var collectErrors: Bool
180180

181+
/// Maximum nesting depth (DoS protection)
182+
private let maxNestingDepth: Int
183+
181184
/// Reference to tokenizer for switching states
182185
public weak var tokenizer: Tokenizer? = nil
183186

@@ -192,12 +195,14 @@ public final class TreeBuilder: TokenSink {
192195
fragmentContext: FragmentContext? = nil,
193196
iframeSrcdoc: Bool = false,
194197
collectErrors: Bool = false,
195-
scripting: Bool = false
198+
scripting: Bool = false,
199+
maxNestingDepth: Int = ParserLimits.default.maxNestingDepth
196200
) {
197201
self.fragmentContext = fragmentContext
198202
self.iframeSrcdoc = iframeSrcdoc
199203
self.collectErrors = collectErrors
200204
self.scripting = scripting
205+
self.maxNestingDepth = maxNestingDepth
201206

202207
if fragmentContext != nil {
203208
self.document = Node(name: "#document-fragment")
@@ -2739,7 +2744,16 @@ public final class TreeBuilder: TokenSink {
27392744
{
27402745
let element = self.createElement(name: name, namespace: namespace, attrs: attrs)
27412746
self.insertNode(element)
2742-
self.openElements.append(element)
2747+
2748+
// DoS protection: limit nesting depth
2749+
// If we've hit the limit, don't push onto stack - element becomes effectively void
2750+
// This prevents stack overflow on extremely deeply nested documents
2751+
if self.openElements.count < self.maxNestingDepth {
2752+
self.openElements.append(element)
2753+
}
2754+
// Note: element is still in the DOM, just won't receive children
2755+
// Content will be inserted into the parent element instead
2756+
27432757
return element
27442758
}
27452759

0 commit comments

Comments
 (0)