Skip to content

Commit 6426a75

Browse files
committed
PDFBOX-6130: parse XMP files without processing instructions; improve javadoc; fix problem with attributes that had no prefix; put prefix into QName (for debugging); add test
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1930761 13f79535-47bb-0310-9956-ffa450edef68
1 parent f09e230 commit 6426a75

File tree

2 files changed

+147
-14
lines changed

2 files changed

+147
-14
lines changed

xmpbox/src/main/java/org/apache/xmpbox/xml/DomXmpParser.java

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,10 @@ public boolean isStrictParsing()
110110
/**
111111
* Enable or disable strict parsing mode.
112112
*
113-
* @param strictParsing Whether to be strict when parsing XMP: true (the default) means that
114-
* malformed XMP will result in an exception, false means that if malformed content is
115-
* encountered, the parser will continue its work if possible.
113+
* @param strictParsing Whether to be strict or lenient when parsing XMP. True (the default)
114+
* means that malformed XMP will result in an exception, false (lenient) means that if malformed
115+
* content is encountered, the parser will continue its work if possible. Use strict mode if you
116+
* want to work with PDF/A files. Use lenient mode if you care more about getting metadata.
116117
*/
117118
public void setStrictParsing(boolean strictParsing)
118119
{
@@ -148,7 +149,17 @@ public XMPMetadata parse(InputStream input) throws XmpParsingException
148149
// expect xpacket processing instruction
149150
if (!(node instanceof ProcessingInstruction))
150151
{
151-
throw new XmpParsingException(ErrorType.XpacketBadStart, "xmp should start with a processing instruction");
152+
if (strictParsing)
153+
{
154+
throw new XmpParsingException(ErrorType.XpacketBadStart, "xmp should start with a processing instruction");
155+
}
156+
else
157+
{
158+
xmp = XMPMetadata.createXMPMetadata(XmpConstants.DEFAULT_XPACKET_BEGIN,
159+
XmpConstants.DEFAULT_XPACKET_ID,
160+
XmpConstants.DEFAULT_XPACKET_BYTES,
161+
XmpConstants.DEFAULT_XPACKET_ENCODING);
162+
}
152163
}
153164
else
154165
{
@@ -175,7 +186,14 @@ public XMPMetadata parse(InputStream input) throws XmpParsingException
175186
// expect xpacket end
176187
if (!(node instanceof ProcessingInstruction))
177188
{
178-
throw new XmpParsingException(ErrorType.XpacketBadEnd, "xmp should end with a processing instruction");
189+
if (strictParsing)
190+
{
191+
throw new XmpParsingException(ErrorType.XpacketBadEnd, "xmp should end with a processing instruction");
192+
}
193+
else
194+
{
195+
xmp.setEndXPacket(XmpConstants.DEFAULT_XPACKET_END);
196+
}
179197
}
180198
else
181199
{
@@ -188,7 +206,7 @@ public XMPMetadata parse(InputStream input) throws XmpParsingException
188206
throw new XmpParsingException(ErrorType.XpacketBadEnd,
189207
"xmp should end after xpacket end processing instruction");
190208
}
191-
// xpacket is OK and the is no more nodes
209+
// xpacket is OK and there are no more nodes
192210
// Now, parse the content of root
193211
Element rdfRdf = findDescriptionsParent(root);
194212
nsFinder.push(rdfRdf); // PDFBOX-6099: push namespaces in rdf:RDF
@@ -351,7 +369,7 @@ private void parseDescriptionRootAttr(XMPMetadata xmp, Element description, Attr
351369
{
352370
ComplexPropertyContainer container = schema.getContainer();
353371
PropertyType type = checkPropertyDefinition(xmp,
354-
new QName(attr.getNamespaceURI(), attr.getLocalName()));
372+
new QName(attr.getNamespaceURI(), attr.getLocalName(), attr.getPrefix()));
355373

356374
if (type == null)
357375
{
@@ -746,8 +764,11 @@ else if (XmpConstants.DEFAULT_RDF_PREFIX.equals(attr.getPrefix())
746764
((XMPSchema) sp).setAboutAsSimple(attr.getValue());
747765
}
748766
}
749-
else
767+
else if (XMLConstants.XML_NS_URI.equals(attr.getNamespaceURI()))
750768
{
769+
// This part was the fallback before PDFBOX-6130, now restricted:
770+
// Do not load "ordinary" attributes here because these will be handled by
771+
// tryParseAttributesAsProperties() and parseDescriptionRootAttr()
751772
Attribute attribute = new Attribute(XMLConstants.XML_NS_URI, attr.getLocalName(), attr.getValue());
752773
sp.setAttribute(attribute);
753774
}
@@ -949,12 +970,20 @@ private void parseEndPacket(XMPMetadata metadata, ProcessingInstruction pi) thro
949970

950971
private Element findDescriptionsParent(Element root) throws XmpParsingException
951972
{
952-
Element rdfRdf;
973+
Element rdfRdf = null;
953974
// check if already rdf element, as xmpmeta wrapper can be optional
954975
if (!XmpConstants.RDF_NAMESPACE.equals(root.getNamespaceURI()))
955976
{
956977
// always <x:xmpmeta xmlns:x="adobe:ns:meta/">
957-
expectNaming(root, "adobe:ns:meta/", "x", "xmpmeta");
978+
if (!strictParsing && "xapmeta".equals(root.getLocalName()))
979+
{
980+
// older XMP content
981+
expectNaming(root, "adobe:ns:meta/", "x", "xapmeta");
982+
}
983+
else
984+
{
985+
expectNaming(root, "adobe:ns:meta/", "x", "xmpmeta");
986+
}
958987
// should only have one child
959988
NodeList nl = root.getChildNodes();
960989
if (nl.getLength() == 0)
@@ -965,14 +994,25 @@ private Element findDescriptionsParent(Element root) throws XmpParsingException
965994
else if (nl.getLength() > 1)
966995
{
967996
// only expect one element
968-
throw new XmpParsingException(ErrorType.Format, "More than one element found in x:xmpmeta");
997+
if (strictParsing)
998+
{
999+
throw new XmpParsingException(ErrorType.Format, "More than one element found in x:xmpmeta");
1000+
}
1001+
}
1002+
// find the first element child (there may be a text node, e.g. whitespace, before the element)
1003+
for (int i = 0; i < nl.getLength(); ++i)
1004+
{
1005+
if (nl.item(i) instanceof Element)
1006+
{
1007+
rdfRdf = (Element) nl.item(i);
1008+
break;
1009+
}
9691010
}
970-
else if (!(root.getFirstChild() instanceof Element))
1011+
if (rdfRdf == null)
9711012
{
9721013
// should be an element
9731014
throw new XmpParsingException(ErrorType.Format, "x:xmpmeta does not contains rdf:RDF element");
9741015
} // else let's parse
975-
rdfRdf = (Element) root.getFirstChild();
9761016
}
9771017
else
9781018
{

xmpbox/src/test/java/org/apache/xmpbox/xml/DomXmpParserTest.java

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
import org.apache.xmpbox.schema.ExifSchema;
3636
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
3737
import org.apache.xmpbox.schema.PhotoshopSchema;
38+
import org.apache.xmpbox.schema.TiffSchema;
39+
import org.apache.xmpbox.schema.XMPBasicSchema;
3840
import org.apache.xmpbox.schema.XMPMediaManagementSchema;
3941
import org.apache.xmpbox.schema.XMPPageTextSchema;
4042
import org.apache.xmpbox.schema.XMPSchema;
@@ -661,6 +663,10 @@ void testBadLocalName() throws XmpParsingException
661663
XmpParsingException.class,
662664
() -> xmpParser.parse(s.getBytes(StandardCharsets.UTF_8)));
663665
assertEquals("Expecting local name 'xmpmeta' and found 'xapmeta'", ex.getMessage());
666+
DomXmpParser xmpParser2 = new DomXmpParser();
667+
xmpParser2.setStrictParsing(false);
668+
XMPMetadata xmp2 = xmpParser2.parse(s.getBytes(StandardCharsets.UTF_8));
669+
assertEquals(0, xmp2.getAllSchemas().size());
664670
}
665671

666672
@Test
@@ -1259,4 +1265,91 @@ void testLenientPdfaExtension() throws XmpParsingException
12591265
assertEquals("uuid:0b306144-6a43-dcbd-6b3e-c6b6b1df873d", xmpMediaManagementSchema.getInstanceID());
12601266
assertEquals("uuid:0b306144-6a43-dcbd-6b3e-c6b6b1df873d", xmpMediaManagementSchema.getDocumentID());
12611267
}
1262-
}
1268+
1269+
@Test
1270+
void testNoProcessingInstruction() throws XmpParsingException, TransformerException
1271+
{
1272+
// From file 000163.pdf
1273+
// Coastal Services Magazine Volume 11_6 November/December
1274+
String s = "<x:xmpmeta xmlns:x=\"adobe:ns:meta/\" x:xmptk=\"Adobe XMP Core 4.1-c037 46.282696, Mon Apr 02 2007 18:36:42 \">\n" +
1275+
" <rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n" +
1276+
" <rdf:Description rdf:about=\"\"\n" +
1277+
" xmlns:xapMM=\"http://ns.adobe.com/xap/1.0/mm/\"\n" +
1278+
" xmlns:stRef=\"http://ns.adobe.com/xap/1.0/sType/ResourceRef#\"\n" +
1279+
" xmlns:tiff=\"http://ns.adobe.com/tiff/1.0/\"\n" +
1280+
" xmlns:xap=\"http://ns.adobe.com/xap/1.0/\"\n" +
1281+
" xmlns:exif=\"http://ns.adobe.com/exif/1.0/\"\n" +
1282+
" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n" +
1283+
" xmlns:photoshop=\"http://ns.adobe.com/photoshop/1.0/\"\n" +
1284+
" xapMM:DocumentID=\"uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143\"\n" +
1285+
" xapMM:InstanceID=\"uuid:7A28FBF56920DA11B4BBB356C0A5C72B\"\n" +
1286+
" tiff:Orientation=\"1\"\n" +
1287+
" tiff:XResolution=\"3050000/10000\"\n" +
1288+
" tiff:YResolution=\"3050000/10000\"\n" +
1289+
" tiff:ResolutionUnit=\"2\"\n" +
1290+
" tiff:NativeDigest=\"123456\"\n" +
1291+
" xap:ModifyDate=\"2005-09-08T09:13:10-04:00\"\n" +
1292+
" xap:CreatorTool=\"Adobe Photoshop CS2 Windows\"\n" +
1293+
" xap:CreateDate=\"2005-08-02T13:47:24-04:00\"\n" +
1294+
" xap:MetadataDate=\"2005-09-08T09:13:10-04:00\"\n" +
1295+
" exif:ColorSpace=\"-1\"\n" +
1296+
" exif:PixelXDimension=\"1525\"\n" +
1297+
" exif:PixelYDimension=\"387\"\n" +
1298+
" exif:NativeDigest=\"12345678\"\n" +
1299+
" dc:format=\"image/tiff\"\n" +
1300+
" photoshop:ColorMode=\"4\"\n" +
1301+
" photoshop:ICCProfile=\"U.S. Web Coated (SWOP) v2\"\n" +
1302+
" photoshop:History=\"\">\n" +
1303+
" <xapMM:DerivedFrom\n" +
1304+
" stRef:instanceID=\"adobe:docid:photoshop:28ff3dc5-4801-11d8-85d1-bb49d244e2ef\"\n" +
1305+
" stRef:documentID=\"adobe:docid:photoshop:28ff3dc5-4801-11d8-85d1-bb49d244e2ef\"/>\n" +
1306+
" </rdf:Description>\n" +
1307+
" </rdf:RDF>\n" +
1308+
"</x:xmpmeta>";
1309+
final DomXmpParser xmpParser1 = new DomXmpParser();
1310+
XmpParsingException ex = assertThrows(XmpParsingException.class,
1311+
() -> xmpParser1.parse(s.getBytes(StandardCharsets.UTF_8)));
1312+
assertEquals("xmp should start with a processing instruction", ex.getMessage());
1313+
DomXmpParser xmpParser2 = new DomXmpParser();
1314+
xmpParser2.setStrictParsing(false);
1315+
XMPMetadata xmp2 = xmpParser2.parse(s.getBytes(StandardCharsets.UTF_8));
1316+
DublinCoreSchema dublinCoreSchema = xmp2.getDublinCoreSchema();
1317+
assertEquals("image/tiff", dublinCoreSchema.getFormat());
1318+
XMPMediaManagementSchema xmpMediaManagementSchema = xmp2.getXMPMediaManagementSchema();
1319+
assertEquals("uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143", xmpMediaManagementSchema.getDocumentID());
1320+
TiffSchema tiffSchema = (TiffSchema) xmp2.getSchema(TiffSchema.class);
1321+
assertEquals("[Orientation=IntegerType:1]", tiffSchema.getProperty(TiffSchema.ORIENTATION).toString());
1322+
PhotoshopSchema photoshopSchema = xmp2.getPhotoshopSchema();
1323+
assertEquals((Integer) 4, photoshopSchema.getColorMode());
1324+
ExifSchema exifSchema = (ExifSchema) xmp2.getSchema(ExifSchema.class);
1325+
assertEquals("[PixelXDimension=IntegerType:1525]", exifSchema.getProperty(ExifSchema.PIXEL_X_DIMENSION).toString());
1326+
XMPBasicSchema xmpBasicSchema = xmp2.getXMPBasicSchema();
1327+
assertEquals("Adobe Photoshop CS2 Windows", xmpBasicSchema.getCreatorTool());
1328+
XmpSerializer serializer = new XmpSerializer();
1329+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
1330+
serializer.serialize(xmp2, baos, true);
1331+
// check that there are no isolated properties
1332+
// (Happened before the change at the bottom of loadAttributes())
1333+
String s2 = baos.toString(StandardCharsets.UTF_8);
1334+
assertFalse(s2.contains(" ColorMode="));
1335+
assertFalse(s2.contains(" CreateDate="));
1336+
assertFalse(s2.contains(" CreatorTool="));
1337+
assertFalse(s2.contains(" DocumentID="));
1338+
// now make sure that parsing again still brings the same data
1339+
DomXmpParser xmpParser3 = new DomXmpParser();
1340+
xmpParser3.setStrictParsing(false);
1341+
XMPMetadata xmp3 = xmpParser3.parse(baos.toByteArray());
1342+
DublinCoreSchema dublinCoreSchema3 = xmp3.getDublinCoreSchema();
1343+
assertEquals("image/tiff", dublinCoreSchema3.getFormat());
1344+
XMPMediaManagementSchema xmpMediaManagementSchema3 = xmp3.getXMPMediaManagementSchema();
1345+
assertEquals("uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143", xmpMediaManagementSchema3.getDocumentID());
1346+
TiffSchema tiffSchema3 = (TiffSchema) xmp3.getSchema(TiffSchema.class);
1347+
assertEquals("[Orientation=IntegerType:1]", tiffSchema3.getProperty(TiffSchema.ORIENTATION).toString());
1348+
PhotoshopSchema photoshopSchema3 = xmp3.getPhotoshopSchema();
1349+
assertEquals((Integer) 4, photoshopSchema3.getColorMode());
1350+
ExifSchema exifSchema3 = (ExifSchema) xmp3.getSchema(ExifSchema.class);
1351+
assertEquals("[PixelXDimension=IntegerType:1525]", exifSchema3.getProperty(ExifSchema.PIXEL_X_DIMENSION).toString());
1352+
XMPBasicSchema xmpBasicSchema3 = xmp3.getXMPBasicSchema();
1353+
assertEquals("Adobe Photoshop CS2 Windows", xmpBasicSchema3.getCreatorTool());
1354+
}
1355+
}

0 commit comments

Comments
 (0)