Skip to content

Commit 9a75c18

Browse files
gatorsmilecloud-fan
authored andcommitted
[SPARK-24542][SQL] UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files
## What changes were proposed in this pull request? UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files. Spark does not have built-in access control. When users use the external access control library, users might bypass them and access the file contents. This PR basically patches the Hive fix to Apache Spark. https://issues.apache.org/jira/browse/HIVE-18879 ## How was this patch tested? A unit test case Author: Xiao Li <[email protected]> Closes apache#21549 from gatorsmile/xpathSecurity.
1 parent 1737d45 commit 9a75c18

File tree

3 files changed

+51
-3
lines changed

3 files changed

+51
-3
lines changed

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtil.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
import java.io.Reader;
2222

2323
import javax.xml.namespace.QName;
24+
import javax.xml.parsers.DocumentBuilder;
25+
import javax.xml.parsers.DocumentBuilderFactory;
26+
import javax.xml.parsers.ParserConfigurationException;
2427
import javax.xml.xpath.XPath;
2528
import javax.xml.xpath.XPathConstants;
2629
import javax.xml.xpath.XPathExpression;
@@ -37,9 +40,15 @@
3740
* This is based on Hive's UDFXPathUtil implementation.
3841
*/
3942
public class UDFXPathUtil {
43+
public static final String SAX_FEATURE_PREFIX = "http://xml.org/sax/features/";
44+
public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE = "external-general-entities";
45+
public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE = "external-parameter-entities";
46+
private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
47+
private DocumentBuilder builder = null;
4048
private XPath xpath = XPathFactory.newInstance().newXPath();
4149
private ReusableStringReader reader = new ReusableStringReader();
4250
private InputSource inputSource = new InputSource(reader);
51+
4352
private XPathExpression expression = null;
4453
private String oldPath = null;
4554

@@ -65,14 +74,31 @@ public Object eval(String xml, String path, QName qname) throws XPathExpressionE
6574
return null;
6675
}
6776

77+
if (builder == null){
78+
try {
79+
initializeDocumentBuilderFactory();
80+
builder = dbf.newDocumentBuilder();
81+
} catch (ParserConfigurationException e) {
82+
throw new RuntimeException(
83+
"Error instantiating DocumentBuilder, cannot build xml parser", e);
84+
}
85+
}
86+
6887
reader.set(xml);
6988
try {
70-
return expression.evaluate(inputSource, qname);
89+
return expression.evaluate(builder.parse(inputSource), qname);
7190
} catch (XPathExpressionException e) {
7291
throw new RuntimeException("Invalid XML document: " + e.getMessage() + "\n" + xml, e);
92+
} catch (Exception e) {
93+
throw new RuntimeException("Error loading expression '" + oldPath + "'", e);
7394
}
7495
}
7596

97+
private void initializeDocumentBuilderFactory() throws ParserConfigurationException {
98+
dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_GENERAL_ENTITIES_FEATURE, false);
99+
dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_PARAMETER_ENTITIES_FEATURE, false);
100+
}
101+
76102
public Boolean evalBoolean(String xml, String path) throws XPathExpressionException {
77103
return (Boolean) eval(xml, path, XPathConstants.BOOLEAN);
78104
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/UDFXPathUtilSuite.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,27 @@ class UDFXPathUtilSuite extends SparkFunSuite {
7777
assert(ret == "foo")
7878
}
7979

80+
test("embedFailure") {
81+
import org.apache.commons.io.FileUtils
82+
import java.io.File
83+
val secretValue = String.valueOf(Math.random)
84+
val tempFile = File.createTempFile("verifyembed", ".tmp")
85+
tempFile.deleteOnExit()
86+
val fname = tempFile.getAbsolutePath
87+
88+
FileUtils.writeStringToFile(tempFile, secretValue)
89+
90+
val xml =
91+
s"""<?xml version="1.0" encoding="utf-8"?>
92+
|<!DOCTYPE test [
93+
| <!ENTITY embed SYSTEM "$fname">
94+
|]>
95+
|<foo>&embed;</foo>
96+
""".stripMargin
97+
val evaled = new UDFXPathUtil().evalString(xml, "/foo")
98+
assert(evaled.isEmpty)
99+
}
100+
80101
test("number eval") {
81102
var ret =
82103
util.evalNumber("<a><b>true</b><b>false</b><b>b3</b><c>c1</c><c>-77</c></a>", "a/c[2]")

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/xml/XPathExpressionSuite.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,9 @@ class XPathExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
4040

4141
// Test error message for invalid XML document
4242
val e1 = intercept[RuntimeException] { testExpr("<a>/a>", "a", null.asInstanceOf[T]) }
43-
assert(e1.getCause.getMessage.contains("Invalid XML document") &&
44-
e1.getCause.getMessage.contains("<a>/a>"))
43+
assert(e1.getCause.getCause.getMessage.contains(
44+
"XML document structures must start and end within the same entity."))
45+
assert(e1.getMessage.contains("<a>/a>"))
4546

4647
// Test error message for invalid xpath
4748
val e2 = intercept[RuntimeException] { testExpr("<a></a>", "!#$", null.asInstanceOf[T]) }

0 commit comments

Comments
 (0)