Skip to content

Commit 876f7bf

Browse files
committed
refactor the xml tools
Signed-off-by: sezen.leblay <[email protected]>
1 parent c67a062 commit 876f7bf

File tree

14 files changed

+754
-1499
lines changed

14 files changed

+754
-1499
lines changed
Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
package datadog.trace.bootstrap.instrumentation;
2+
3+
import java.io.StringReader;
4+
import java.util.ArrayList;
5+
import java.util.Collections;
6+
import java.util.HashMap;
7+
import java.util.List;
8+
import java.util.Map;
9+
import javax.xml.parsers.DocumentBuilder;
10+
import javax.xml.parsers.DocumentBuilderFactory;
11+
import org.w3c.dom.Attr;
12+
import org.w3c.dom.Document;
13+
import org.w3c.dom.Element;
14+
import org.w3c.dom.NamedNodeMap;
15+
import org.w3c.dom.Node;
16+
import org.w3c.dom.NodeList;
17+
import org.w3c.dom.Text;
18+
import org.xml.sax.InputSource;
19+
20+
/**
 * Utility class for converting W3C DOM XML structures to Map/List representations that are
 * compatible with WAF analysis and schema extraction.
 *
 * <p>This centralized utility eliminates code duplication across multiple instrumentation modules
 * that need to process XML content for AppSec analysis.
 *
 * <p>Shape of the converted data:
 *
 * <ul>
 *   <li>an element becomes a {@code Map<String, Object>} holding an {@code "attributes"} entry
 *       (a {@code Map<String, String>}) and a {@code "children"} entry (a {@code List<Object>});
 *       either entry is omitted when it would be empty, and the element name is not recorded;
 *   <li>a text node becomes its trimmed content, or is dropped when that content is empty;
 *   <li>every other node type (comments, processing instructions, ...) is dropped.
 * </ul>
 *
 * <p>All methods are static and keep no state between calls.
 */
public final class XmlDomUtils {

  private XmlDomUtils() {
    // Utility class - prevent instantiation
  }

  /**
   * Convert a W3C DOM Document to a WAF-compatible Map/List structure.
   *
   * @param document the XML document to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return the converted root element wrapped in a single-element list, or {@code null} if the
   *     document is {@code null}, has no root element, or {@code maxRecursion <= 0}
   */
  public static Object convertDocument(Document document, int maxRecursion) {
    if (document == null) {
      return null;
    }
    // convertW3cNode tolerates a null root element and yields null for it.
    return wrapInList(convertW3cNode(document.getDocumentElement(), maxRecursion));
  }

  /**
   * Convert a W3C DOM Element to a WAF-compatible Map/List structure.
   *
   * @param element the XML element to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return the converted element wrapped in a single-element list, or {@code null} if the element
   *     is {@code null} or {@code maxRecursion <= 0}
   */
  public static Object convertElement(Element element, int maxRecursion) {
    return wrapInList(convertW3cNode(element, maxRecursion));
  }

  /**
   * Convert a W3C DOM Node to a WAF-compatible Map/List structure.
   *
   * <p>This method recursively processes XML nodes, converting:
   *
   * <ul>
   *   <li>elements to Maps with optional "attributes" and "children" keys;
   *   <li>text nodes to their trimmed string content;
   *   <li>any other node type to {@code null} (i.e. it is ignored).
   * </ul>
   *
   * <p>Each level of element nesting consumes one unit of {@code maxRecursion}; descendants that
   * lie deeper than the budget are silently omitted from the result.
   *
   * @param node the XML node to convert
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return Map for elements, String for non-blank text nodes, {@code null} for other node types,
   *     for a {@code null} node, or when {@code maxRecursion <= 0}
   */
  public static Object convertW3cNode(Node node, int maxRecursion) {
    if (node == null || maxRecursion <= 0) {
      return null;
    }
    if (node instanceof Element) {
      return convertElementNode((Element) node, maxRecursion);
    }
    if (node instanceof Text) {
      return convertTextNode((Text) node);
    }
    // Ignore other node types (comments, processing instructions, etc.)
    return null;
  }

  /** Wrap a converted node in a single-element list, propagating {@code null} unchanged. */
  private static Object wrapInList(Object converted) {
    return converted == null ? null : Collections.singletonList(converted);
  }

  /** Convert an Element node to a Map with attributes and children. */
  private static Map<String, Object> convertElementNode(Element element, int maxRecursion) {
    Map<String, Object> repr = new HashMap<>();

    if (element.hasAttributes()) {
      NamedNodeMap attrMap = element.getAttributes();
      int attrCount = attrMap.getLength();
      Map<String, String> attributes = new HashMap<>();
      for (int i = 0; i < attrCount; i++) {
        Attr attr = (Attr) attrMap.item(i);
        attributes.put(attr.getName(), attr.getValue());
      }
      if (!attributes.isEmpty()) {
        repr.put("attributes", attributes);
      }
    }

    if (element.hasChildNodes()) {
      NodeList childNodes = element.getChildNodes();
      int childCount = childNodes.getLength();
      List<Object> children = new ArrayList<>(childCount);
      for (int i = 0; i < childCount; i++) {
        // Children get one unit less of depth budget; a null result means "ignored node".
        Object childResult = convertW3cNode(childNodes.item(i), maxRecursion - 1);
        if (childResult != null) {
          children.add(childResult);
        }
      }
      if (!children.isEmpty()) {
        repr.put("children", children);
      }
    }

    return repr;
  }

  /** Convert a Text node to its trimmed string content, or {@code null} when it is blank. */
  private static String convertTextNode(Text textNode) {
    String textContent = textNode.getTextContent();
    if (textContent == null) {
      return null;
    }
    String trimmed = textContent.trim();
    return trimmed.isEmpty() ? null : trimmed;
  }

  /**
   * Check if an object holds XML content: either a W3C DOM node or a string that looks like XML.
   *
   * @param obj the object to check
   * @return true if the object is a DOM {@link Node} (which includes {@link Document} and {@link
   *     Element}) or a String accepted by {@link #isXmlContent(String)}, false otherwise
   */
  public static boolean isXmlContent(Object obj) {
    if (obj instanceof Node) {
      return true;
    }
    // Single source of truth for the string heuristic lives in the String overload.
    return obj instanceof String && isXmlContent((String) obj);
  }

  /**
   * Check if a string contains XML content by looking for XML declaration or root element.
   *
   * <p>This is a cheap heuristic, not a parse: after trimming, the string must either start with
   * {@code <?xml}, or start with {@code <}, end with {@code >} and contain a closing ({@code </})
   * or self-closing ({@code />}) tag marker.
   *
   * @param content the string content to check
   * @return true if the string looks like XML content, false otherwise (including for null, blank
   *     or JSON-looking input)
   */
  public static boolean isXmlContent(String content) {
    if (content == null) {
      return false;
    }
    String trimmed = content.trim();
    if (trimmed.isEmpty()) {
      return false;
    }

    // Explicitly exclude JSON content
    if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
      return false;
    }

    if (trimmed.startsWith("<?xml")) {
      return true;
    }
    return trimmed.startsWith("<")
        && trimmed.endsWith(">")
        && (trimmed.contains("</") || trimmed.contains("/>"));
  }

  /**
   * Process XML content (strings or DOM objects) for WAF compatibility. This ensures XML attack
   * payloads are properly detected by the WAF.
   *
   * @param xmlObj the XML object to process (can be Document, Element, Node, or String)
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return processed XML structure compatible with WAF analysis, or null if the input is null, of
   *     an unsupported type, or cannot be parsed/converted
   */
  public static Object processXmlForWaf(Object xmlObj, int maxRecursion) {
    // Handle W3C DOM objects directly. Document needs its root element extracted first; every
    // other node kind (Element included) is converted as-is.
    if (xmlObj instanceof Document) {
      return convertDocument((Document) xmlObj, maxRecursion);
    }
    if (xmlObj instanceof Node) {
      return wrapInList(convertW3cNode((Node) xmlObj, maxRecursion));
    }

    // Handle XML strings by parsing them first
    if (xmlObj instanceof String) {
      return handleXmlString((String) xmlObj, maxRecursion);
    }

    return null;
  }

  /**
   * Convert XML string to WAF-compatible format following Spring framework pattern. This ensures
   * XML attack payloads are properly detected by the WAF.
   *
   * <p>The parser is configured to reject DOCTYPE declarations and external entities, so documents
   * carrying a DTD fail to parse instead of being expanded.
   *
   * @param xmlContent the XML string content to parse
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return parsed XML structure compatible with WAF analysis, or null if the content is null or
   *     blank
   * @throws Exception if the parser cannot be configured or XML parsing fails
   */
  public static Object parseXmlStringToWafFormat(String xmlContent, int maxRecursion)
      throws Exception {
    if (xmlContent == null || xmlContent.trim().isEmpty()) {
      return null;
    }

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // Security settings to prevent XXE attacks during parsing
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setExpandEntityReferences(false);

    DocumentBuilder builder = factory.newDocumentBuilder();
    // Without an explicit handler the JDK's built-in parser prints "[Fatal Error] ..." to
    // System.err for every malformed payload. DefaultHandler ignores warnings and recoverable
    // errors and rethrows fatal ones, so callers still get the exception but the host
    // application's stderr stays clean.
    builder.setErrorHandler(new org.xml.sax.helpers.DefaultHandler());

    Document document = builder.parse(new InputSource(new StringReader(xmlContent)));

    return convertDocument(document, maxRecursion);
  }

  /**
   * Convert XML string to WAF-compatible format. This is a convenience method that wraps
   * parseXmlStringToWafFormat and handles exceptions internally.
   *
   * @param xmlContent the XML string content to handle
   * @param maxRecursion maximum recursion depth to prevent stack overflow
   * @return parsed XML structure compatible with WAF analysis, or null if parsing fails
   */
  public static Object handleXmlString(String xmlContent, int maxRecursion) {
    try {
      return parseXmlStringToWafFormat(xmlContent, maxRecursion);
    } catch (Exception ignored) {
      // Best effort by design: return null if parsing fails - let caller handle logging
      return null;
    }
  }
}

0 commit comments

Comments
 (0)