Skip to content

Commit 6f400f2

Browse files
committed
refactor the xml tools
Signed-off-by: sezen.leblay <[email protected]>
1 parent c67a062 commit 6f400f2

File tree

13 files changed

+337
-1499
lines changed

13 files changed

+337
-1499
lines changed
Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
package datadog.trace.bootstrap.instrumentation;
2+
3+
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
19+
20+
/**
 * Utility class for converting W3C DOM XML structures to Map/List representations that are
 * compatible with WAF analysis and schema extraction.
 *
 * <p>This centralized utility eliminates code duplication across multiple instrumentation modules
 * that need to process XML content for AppSec analysis.
 *
 * <p>Output shape: every element becomes a {@code Map<String, Object>} that may hold an {@code
 * "attributes"} entry (a {@code Map<String, String>}) and a {@code "children"} entry (a {@code
 * List<Object>} of nested element maps and text strings). Empty sections are omitted, so an
 * element without attributes or retained children is an empty map.
 */
public final class XmlDomUtils {

  /** Key under which an element's attribute map is stored. */
  private static final String ATTRIBUTES_KEY = "attributes";

  /** Key under which an element's converted child list is stored. */
  private static final String CHILDREN_KEY = "children";

  /** Stateless, so one instance can be shared by every parse. */
  private static final ErrorHandler SILENT_ERROR_HANDLER = new SilentErrorHandler();

  private XmlDomUtils() {
    // Utility class - prevent instantiation
  }

  /**
   * Convert a W3C DOM Document to a WAF-compatible Map/List structure.
   *
   * @param document the XML document to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return the converted root element wrapped in a single-element list, or {@code null} if the
   *     document is {@code null}, has no root element, or {@code maxRecursion <= 0}
   */
  public static Object convertDocument(Document document, int maxRecursion) {
    if (document == null) {
      return null;
    }
    // convertElement already handles a missing (null) root element.
    return convertElement(document.getDocumentElement(), maxRecursion);
  }

  /**
   * Convert a W3C DOM Element to a WAF-compatible Map/List structure.
   *
   * @param element the XML element to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return the converted element wrapped in a single-element list, or {@code null} if the element
   *     is {@code null} or {@code maxRecursion <= 0}
   */
  public static Object convertElement(Element element, int maxRecursion) {
    return wrapInList(convertW3cNode(element, maxRecursion));
  }

  /**
   * Convert a W3C DOM Node to a WAF-compatible Map/List structure.
   *
   * <p>This method recursively processes XML nodes, converting:
   *
   * <ul>
   *   <li>Elements to Maps with optional "attributes" and "children" keys
   *   <li>Text nodes to their trimmed string content (blank text is dropped)
   *   <li>Any other node type (comments, processing instructions, ...) to {@code null}
   * </ul>
   *
   * @param node the XML node to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow; each element level
   *     consumes one unit
   * @return Map for elements, String for non-blank text nodes, {@code null} for other types or
   *     when {@code maxRecursion <= 0}
   */
  public static Object convertW3cNode(Node node, int maxRecursion) {
    if (node == null || maxRecursion <= 0) {
      return null;
    }

    if (node instanceof Element) {
      return convertElementNode((Element) node, maxRecursion);
    }
    if (node instanceof Text) {
      return convertTextNode((Text) node);
    }

    // Ignore other node types (comments, processing instructions, etc.)
    return null;
  }

  /**
   * Check whether an object carries XML content: any W3C DOM node, or a string that looks like
   * XML according to {@link #isXmlContent(String)}.
   *
   * @param obj the object to check
   * @return true if the object is a DOM node or an XML-looking string, false otherwise
   */
  public static boolean isXmlContent(Object obj) {
    // Document and Element are both Node subtypes, so one check covers all DOM inputs.
    if (obj instanceof Node) {
      return true;
    }
    // Reuse the String heuristic so the two overloads cannot drift apart.
    return obj instanceof String && isXmlContent((String) obj);
  }

  /**
   * Check if a string contains XML content by looking for XML declaration or root element.
   *
   * <p>This is a cheap textual heuristic; it does not parse the content.
   *
   * @param content the string content to check
   * @return true if the string contains XML content, false otherwise
   */
  public static boolean isXmlContent(String content) {
    if (content == null) {
      return false;
    }
    String trimmed = content.trim();
    if (trimmed.isEmpty()) {
      return false;
    }

    // Explicitly exclude JSON content
    if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
      return false;
    }

    // Either an XML declaration, or something that starts with '<', ends with '>' and contains
    // at least one closing tag.
    return trimmed.startsWith("<?xml")
        || (trimmed.startsWith("<") && trimmed.endsWith(">") && trimmed.contains("</"));
  }

  /**
   * Process XML content (strings or DOM objects) for WAF compatibility. This ensures XML attack
   * payloads are properly detected by the WAF.
   *
   * @param xmlObj the XML object to process (can be Document, Element, Node, or String)
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return processed XML structure compatible with WAF analysis, or {@code null} if the input is
   *     {@code null}, of an unsupported type, or cannot be processed
   */
  public static Object processXmlForWaf(Object xmlObj, int maxRecursion) {
    // Document must be tested before Node: a Document is a Node, but only its root element is
    // convertible.
    if (xmlObj instanceof Document) {
      return convertDocument((Document) xmlObj, maxRecursion);
    }
    if (xmlObj instanceof Node) {
      return wrapInList(convertW3cNode((Node) xmlObj, maxRecursion));
    }
    if (xmlObj instanceof String) {
      // Parsing failures yield null - let caller handle logging
      return handleXmlString((String) xmlObj, maxRecursion);
    }
    return null;
  }

  /**
   * Convert XML string to WAF-compatible format following Spring framework pattern. This ensures
   * XML attack payloads are properly detected by the WAF.
   *
   * <p>The parser is configured to reject DOCTYPE declarations and external entities, and it
   * never writes diagnostics to {@code System.err}; malformed input is reported only through the
   * thrown exception.
   *
   * @param xmlContent the XML string content to parse
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return parsed XML structure compatible with WAF analysis, or {@code null} for null/blank
   *     input
   * @throws Exception if the parser cannot be configured or XML parsing fails
   */
  public static Object parseXmlStringToWafFormat(String xmlContent, int maxRecursion)
      throws Exception {
    if (xmlContent == null || xmlContent.trim().isEmpty()) {
      return null;
    }

    DocumentBuilder builder = newHardenedDocumentBuilder();
    Document document = builder.parse(new InputSource(new StringReader(xmlContent)));

    return convertDocument(document, maxRecursion);
  }

  /**
   * Convert XML string to WAF-compatible format. This is a convenience method that wraps
   * parseXmlStringToWafFormat and handles exceptions internally.
   *
   * @param xmlContent the XML string content to handle
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return parsed XML structure compatible with WAF analysis, or null if parsing fails
   */
  public static Object handleXmlString(String xmlContent, int maxRecursion) {
    try {
      return parseXmlStringToWafFormat(xmlContent, maxRecursion);
    } catch (Exception ignored) {
      // Best effort by contract: return null if parsing fails - let caller handle logging
      return null;
    }
  }

  /** Wrap a converted node in a single-element list, propagating {@code null} unchanged. */
  private static Object wrapInList(Object converted) {
    // Wrap in a list for consistency with other XML processing patterns
    return converted != null ? Collections.singletonList(converted) : null;
  }

  /** Convert an Element node to a Map with attributes and children. */
  private static Map<String, Object> convertElementNode(Element element, int maxRecursion) {
    Map<String, Object> repr = new HashMap<>();

    if (element.hasAttributes()) {
      NamedNodeMap attrMap = element.getAttributes();
      int attrCount = attrMap.getLength();
      Map<String, String> attributes = new HashMap<>();
      for (int i = 0; i < attrCount; i++) {
        Attr attr = (Attr) attrMap.item(i);
        attributes.put(attr.getName(), attr.getValue());
      }
      if (!attributes.isEmpty()) {
        repr.put(ATTRIBUTES_KEY, attributes);
      }
    }

    if (element.hasChildNodes()) {
      NodeList childNodes = element.getChildNodes();
      int childCount = childNodes.getLength();
      List<Object> children = new ArrayList<>(childCount);
      for (int i = 0; i < childCount; i++) {
        // Children get one unit less of depth budget; null results (ignored node types, blank
        // text, exhausted budget) are dropped.
        Object childResult = convertW3cNode(childNodes.item(i), maxRecursion - 1);
        if (childResult != null) {
          children.add(childResult);
        }
      }
      if (!children.isEmpty()) {
        repr.put(CHILDREN_KEY, children);
      }
    }

    return repr;
  }

  /** Convert a Text node to its trimmed string content, or null when it is blank. */
  private static String convertTextNode(Text textNode) {
    String textContent = textNode.getTextContent();
    if (textContent == null) {
      return null;
    }
    String trimmed = textContent.trim();
    return trimmed.isEmpty() ? null : trimmed;
  }

  /** Create a DocumentBuilder that is locked down against XXE and never prints to stderr. */
  private static DocumentBuilder newHardenedDocumentBuilder() throws ParserConfigurationException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // Security settings to prevent XXE attacks during parsing
    factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setExpandEntityReferences(false);

    DocumentBuilder builder = factory.newDocumentBuilder();
    // Without an explicit handler the JDK parser reports fatal errors on System.err, which would
    // leak diagnostics into the instrumented application's output for every malformed payload.
    builder.setErrorHandler(SILENT_ERROR_HANDLER);
    return builder;
  }

  /**
   * SAX error handler that reports nothing on its own: warnings and recoverable errors are
   * ignored so parsing continues, fatal errors are rethrown to abort the parse.
   */
  private static final class SilentErrorHandler implements ErrorHandler {
    @Override
    public void warning(SAXParseException exception) {
      // Intentionally ignored: parsing continues.
    }

    @Override
    public void error(SAXParseException exception) {
      // Intentionally ignored: recoverable, parsing continues.
    }

    @Override
    public void fatalError(SAXParseException exception) throws SAXParseException {
      throw exception;
    }
  }
}

0 commit comments

Comments
 (0)