Skip to content

Commit 0964e81

Browse files
authored
Merge pull request #11619 from GlobalDataverseCommunityConsortium/xmlutil
XML Parsing Updates
2 parents 7e16ed3 + 3e42ad6 commit 0964e81

File tree

19 files changed

+838
-102
lines changed

19 files changed

+838
-102
lines changed

doc/release-notes/xmlutil.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The configuration of XML parsers used in Dataverse has been centralized and unused functionality has been turned off to enhance security.

doc/sphinx-guides/source/api/changelog.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ v6.7
1313
- An undocumented :doc:`search` parameter called "show_my_data" has been removed. It was never exercised by tests and is believed to be unused. API users should use the :ref:`api-mydata` API instead.
1414
- /api/datasets/{id}/curationStatus API now includes a JSON object with curation label, createtime, and assigner rather than a string 'label' and it supports a new boolean includeHistory parameter (default false) that returns a JSON array of statuses
1515
- /api/datasets/{id}/listCurationStates includes new columns "Status Set Time" and "Status Set By" columns listing the time the current status was applied and by whom. It also supports the boolean includeHistory parameter.
16-
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primiarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.
16+
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.
1717

1818
v6.6
1919
----

src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
2727
import edu.harvard.iq.dataverse.datavariable.VariableMetadataDDIParser;
2828
import edu.harvard.iq.dataverse.search.IndexServiceBean;
29-
29+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
3030
import jakarta.ejb.EJB;
3131
import jakarta.ejb.EJBException;
3232
import jakarta.ejb.Stateless;
@@ -355,13 +355,19 @@ private boolean updateDraftVersion(ArrayList<VariableMetadata> neededToUpdateVM,
355355

356356
private void readXML(InputStream body, Map<Long,VariableMetadata> mapVarToVarMet, Map<Long,VarGroup> varGroupMap) throws XMLStreamException, NullPointerException {
357357

358-
XMLInputFactory factory=XMLInputFactory.newInstance();
358+
XMLInputFactory factory=XmlUtil.getSecureXMLInputFactory();
359359
XMLStreamReader xmlr=factory.createXMLStreamReader(body);
360360

361361
VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser();
362362

363363
vmdp.processDataDscr(xmlr, mapVarToVarMet, varGroupMap);
364-
364+
if (xmlr != null) {
365+
try {
366+
xmlr.close();
367+
} catch (XMLStreamException e) {
368+
logger.warning("XMLStreamException closing XMLStreamReader in readXml");
369+
}
370+
}
365371
}
366372

367373
private boolean newGroups(Map<Long,VarGroup> varGroupMap, FileMetadata fm) {

src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import edu.harvard.iq.dataverse.license.License;
2222
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
2323
import edu.harvard.iq.dataverse.util.StringUtil;
24+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
25+
2426
import java.io.File;
2527
import java.io.FileInputStream;
2628
import java.io.FileNotFoundException;
@@ -121,8 +123,7 @@ public class ImportDDIServiceBean {
121123
// TODO: stop passing the xml source as a string; (it could be huge!) -- L.A. 4.5
122124
// TODO: what L.A. Said.
123125
public DatasetDTO doImport(ImportType importType, String xmlToParse) throws XMLStreamException, ImportException {
124-
xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
125-
xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); DatasetDTO datasetDTO = this.initializeDataset();
126+
DatasetDTO datasetDTO = this.initializeDataset();
126127

127128
// Read docDescr and studyDesc into DTO objects.
128129
// TODO: the fileMap is likely not needed.
@@ -147,11 +148,16 @@ public Map<String, String> mapDDI(ImportType importType, String xmlToParse, Data
147148
Map<String, String> filesMap = new HashMap<>();
148149
StringReader reader = new StringReader(xmlToParse);
149150
XMLStreamReader xmlr = null;
150-
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
151-
xmlFactory.setProperty("javax.xml.stream.isCoalescing", true); // allows the parsing of a CDATA segment into a single event
151+
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
152152
xmlr = xmlFactory.createXMLStreamReader(reader);
153153
processDDI(importType, xmlr, datasetDTO, filesMap);
154-
154+
if (xmlr != null) {
155+
try {
156+
xmlr.close();
157+
} catch (XMLStreamException e) {
158+
logger.warning("XMLStreamException closing XMLStreamReader in mapDDI()");
159+
}
160+
}
155161
return filesMap;
156162
}
157163

src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import edu.harvard.iq.dataverse.util.StringUtil;
2424
import edu.harvard.iq.dataverse.util.json.JsonParseException;
2525
import edu.harvard.iq.dataverse.util.json.JsonParser;
26+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
27+
2628
import java.io.File;
2729
import java.io.FileInputStream;
2830
import java.io.FileNotFoundException;
@@ -106,7 +108,7 @@ public void importXML(String xmlToParse, String foreignFormat, DatasetVersion da
106108

107109
try {
108110
reader = new StringReader(xmlToParse);
109-
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
111+
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
110112
xmlr = xmlFactory.createXMLStreamReader(reader);
111113
DatasetDTO datasetDTO = processXML(xmlr, mappingSupported);
112114

@@ -173,7 +175,7 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo
173175

174176
try {
175177
reader = new StringReader(DcXmlToParse);
176-
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
178+
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
177179
xmlr = xmlFactory.createXMLStreamReader(reader);
178180

179181
//while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments
@@ -184,6 +186,13 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo
184186
processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO);
185187
} catch (XMLStreamException ex) {
186188
throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex);
189+
} finally {
190+
if (xmlr != null) {
191+
try {
192+
xmlr.close();
193+
} catch (XMLStreamException ex) {
194+
}
195+
}
187196
}
188197

189198

@@ -555,9 +564,7 @@ public ImportGenericServiceBean() {
555564

556565
public ImportGenericServiceBean(ImportType importType) {
557566
this.importType=importType;
558-
xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
559-
xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE);
560-
567+
xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
561568
}
562569

563570

@@ -583,21 +590,24 @@ public Map<String, String> mapDCTerms(String xmlToParse, DatasetDTO datasetDTO)
583590
Map<String, String> filesMap = new HashMap<>();
584591
StringReader reader = new StringReader(xmlToParse);
585592
XMLStreamReader xmlr = null;
586-
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
593+
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
587594
xmlr = xmlFactory.createXMLStreamReader(reader);
588595
processDCTerms(xmlr, datasetDTO, filesMap);
589-
596+
if (xmlr != null) {
597+
try {
598+
xmlr.close();
599+
} catch (XMLStreamException ex) {
600+
}
601+
}
590602
return filesMap;
591603
}
592604

593605

594606
public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
595-
FileInputStream in = null;
596607
XMLStreamReader xmlr = null;
597608
Map<String, String> filesMap = new HashMap<>();
598609

599-
try {
600-
in = new FileInputStream(ddiFile);
610+
try (FileInputStream in = new FileInputStream(ddiFile)) {
601611
xmlr = xmlInputFactory.createXMLStreamReader(in);
602612
processDCTerms( xmlr, datasetDTO , filesMap );
603613
} catch (FileNotFoundException ex) {
@@ -606,14 +616,11 @@ public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
606616
} catch (XMLStreamException ex) {
607617
Logger.getLogger("global").log(Level.SEVERE, null, ex);
608618
throw new EJBException("ERROR occurred in mapDDI.", ex);
619+
} catch (IOException e) {
609620
} finally {
610621
try {
611622
if (xmlr != null) { xmlr.close(); }
612623
} catch (XMLStreamException ex) {}
613-
614-
try {
615-
if (in != null) { in.close();}
616-
} catch (IOException ex) {}
617624
}
618625

619626
return filesMap;

src/main/java/edu/harvard/iq/dataverse/authorization/providers/oauth2/impl/OrcidOAuth2AP.java

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import edu.harvard.iq.dataverse.authorization.providers.oauth2.OAuth2UserRecord;
1515
import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
1616
import edu.harvard.iq.dataverse.util.BundleUtil;
17+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
18+
1719
import java.io.IOException;
1820
import java.io.StringReader;
1921
import java.util.*;
@@ -111,52 +113,50 @@ final protected OAuth2UserRecord getUserRecord(@NotNull String responseBody, @No
111113

112114
@Override
113115
protected ParsedUserResponse parseUserResponse(String responseBody) {
114-
DocumentBuilderFactory dbFact = DocumentBuilderFactory.newInstance();
115116
try ( StringReader reader = new StringReader(responseBody)) {
116-
DocumentBuilder db = dbFact.newDocumentBuilder();
117-
Document doc = db.parse( new InputSource(reader) );
118-
119-
String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names" )
120-
.stream().findFirst().map( Node::getTextContent )
121-
.map( String::trim ).orElse("");
122-
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
123-
.stream().findFirst().map( Node::getTextContent )
124-
.map( String::trim ).orElse("");
125-
126-
// fallback - try to use the credit-name
127-
if ( (firstName + familyName).equals("") ) {
128-
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name" )
129-
.stream().findFirst().map( Node::getTextContent )
130-
.map( String::trim ).orElse("");
131-
}
132-
133-
String primaryEmail = getPrimaryEmail(doc);
134-
List<String> emails = getAllEmails(doc);
135-
136-
// make the username up
137-
String username;
138-
if ( primaryEmail.length() > 0 ) {
139-
username = primaryEmail.split("@")[0];
140-
} else {
141-
username = firstName.split(" ")[0] + "." + familyName;
117+
DocumentBuilder db = XmlUtil.getSecureDocumentBuilder();
118+
if (db != null) {
119+
Document doc = db.parse(new InputSource(reader));
120+
121+
String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names")
122+
.stream().findFirst().map(Node::getTextContent)
123+
.map(String::trim).orElse("");
124+
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
125+
.stream().findFirst().map(Node::getTextContent)
126+
.map(String::trim).orElse("");
127+
128+
// fallback - try to use the credit-name
129+
if ((firstName + familyName).equals("")) {
130+
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name")
131+
.stream().findFirst().map(Node::getTextContent)
132+
.map(String::trim).orElse("");
133+
}
134+
135+
String primaryEmail = getPrimaryEmail(doc);
136+
List<String> emails = getAllEmails(doc);
137+
138+
// make the username up
139+
String username;
140+
if (primaryEmail.length() > 0) {
141+
username = primaryEmail.split("@")[0];
142+
} else {
143+
username = firstName.split(" ")[0] + "." + familyName;
144+
}
145+
username = username.replaceAll("[^a-zA-Z0-9.]", "");
146+
147+
// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
148+
// on the access token response.
149+
// Affiliation added after a later call.
150+
final ParsedUserResponse userResponse = new ParsedUserResponse(
151+
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
152+
userResponse.emails.addAll(emails);
153+
154+
return userResponse;
142155
}
143-
username = username.replaceAll("[^a-zA-Z0-9.]","");
144-
145-
// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
146-
// on the access token response.
147-
// Affilifation added after a later call.
148-
final ParsedUserResponse userResponse = new ParsedUserResponse(
149-
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
150-
userResponse.emails.addAll(emails);
151-
152-
return userResponse;
153-
154156
} catch (SAXException ex) {
155157
logger.log(Level.SEVERE, "XML error parsing response body from ORCiD: " + ex.getMessage(), ex);
156158
} catch (IOException ex) {
157159
logger.log(Level.SEVERE, "I/O error parsing response body from ORCiD: " + ex.getMessage(), ex);
158-
} catch (ParserConfigurationException ex) {
159-
logger.log(Level.SEVERE, "While parsing the ORCiD response: Bad parse configuration. " + ex.getMessage(), ex);
160160
}
161161

162162
return null;

src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import edu.harvard.iq.dataverse.util.SystemConfig;
3030
import edu.harvard.iq.dataverse.util.json.JsonUtil;
3131
import edu.harvard.iq.dataverse.util.xml.XmlPrinter;
32+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
3233
import edu.harvard.iq.dataverse.util.xml.XmlWriterUtil;
3334

3435
import java.io.ByteArrayOutputStream;
@@ -41,20 +42,15 @@
4142
import java.util.Map.Entry;
4243
import java.util.logging.Level;
4344
import java.util.logging.Logger;
44-
import jakarta.ejb.EJB;
45-
import jakarta.json.Json;
4645
import jakarta.json.JsonArray;
47-
import jakarta.json.JsonArrayBuilder;
4846
import jakarta.json.JsonObject;
4947
import jakarta.json.JsonString;
5048
import jakarta.json.JsonValue;
5149
import javax.xml.stream.XMLOutputFactory;
5250
import javax.xml.stream.XMLStreamException;
5351
import javax.xml.stream.XMLStreamWriter;
54-
52+
import javax.xml.XMLConstants;
5553
import javax.xml.parsers.DocumentBuilder;
56-
import javax.xml.parsers.DocumentBuilderFactory;
57-
import javax.xml.parsers.ParserConfigurationException;
5854
import org.xml.sax.SAXException;
5955
import org.w3c.dom.Document;
6056
import org.apache.commons.lang3.StringUtils;
@@ -2012,17 +2008,24 @@ private static void createFileDscr(XMLStreamWriter xmlw, JsonArray fileDetails)
20122008

20132009

20142010
public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStream) throws XMLStreamException {
2015-
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
20162011

20172012
try {
2018-
Document document;
2019-
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");
2013+
// Get secure DocumentBuilder from our utility class
2014+
DocumentBuilder builder = XmlUtil.getSecureDocumentBuilder();
2015+
if (builder == null) {
2016+
logger.severe("Could not create secure document builder");
2017+
return;
2018+
}
2019+
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");
20202020

2021-
DocumentBuilder builder = factory.newDocumentBuilder();
2022-
document = builder.parse(datafile);
2021+
Document document = builder.parse(datafile);
20232022

20242023
// Use a Transformer for output
20252024
TransformerFactory tFactory = TransformerFactory.newInstance();
2025+
// Set secure processing feature
2026+
tFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
2027+
tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
2028+
20262029
StreamSource stylesource = new StreamSource(styleSheetInput);
20272030
Transformer transformer = tFactory.newTransformer(stylesource);
20282031

@@ -2035,20 +2038,14 @@ public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStrea
20352038
} catch (TransformerException te) {
20362039
// Error generated by the parser
20372040
logger.severe("Transformation error" + " " + te.getMessage());
2038-
20392041
} catch (SAXException sxe) {
20402042
// Error generated by this application
20412043
// (or a parser-initialization error)
20422044
logger.severe("SAX error " + sxe.getMessage());
2043-
2044-
} catch (ParserConfigurationException pce) {
2045-
// Parser with specified options can't be built
2046-
logger.severe("Parser configuration error " + pce.getMessage());
20472045
} catch (IOException ioe) {
20482046
// I/O error
20492047
logger.warning("I/O error " + ioe.getMessage());
20502048
}
2051-
20522049
}
20532050

20542051
public static void injectSettingsService(SettingsServiceBean settingsSvc) {

src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
package edu.harvard.iq.dataverse.harvest.client;
2121

2222
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
23+
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
24+
2325
import java.io.IOException;
2426

2527
import java.io.InputStream;
@@ -126,7 +128,7 @@ public boolean isDeleted () {
126128
public void harvestRecord(String baseURL, String identifier, String metadataPrefix, Map<String,String> customHeaders, HttpClient httpClient) throws IOException,
127129
ParserConfigurationException, SAXException, TransformerException{
128130

129-
xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
131+
xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
130132
String requestURL = getRequestURL(baseURL, identifier, metadataPrefix);
131133
InputStream in;
132134

0 commit comments

Comments
 (0)