Skip to content

Commit dd14d21

Browse files
committed
PDFBOX-5982: support DP and MP operators
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1924801 13f79535-47bb-0310-9956-ffa450edef68
1 parent 5823cb0 commit dd14d21

File tree

5 files changed

+193
-3
lines changed

5 files changed

+193
-3
lines changed

pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,4 +1182,15 @@ public boolean isShouldProcessColorOperators()
11821182
{
11831183
return shouldProcessColorOperators;
11841184
}
1185+
1186+
/**
1187+
* Called for the MP and DP (marked-content point) operators; this base implementation does nothing.
1188+
*
1189+
* @param tag indicates the role or significance of the marked-content point
1190+
* @param properties the property dictionary passed with the point, or null when the operator carries none
1191+
*/
1192+
public void markedContentPoint(COSName tag, COSDictionary properties)
1193+
{
1194+
// intentionally empty: subclasses override this to react to marked-content points
1195+
}
11851196
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.pdfbox.contentstream.operator.markedcontent;
18+
19+
import java.io.IOException;
20+
import java.util.List;
21+
import org.apache.pdfbox.contentstream.PDFStreamEngine;
22+
import org.apache.pdfbox.contentstream.operator.MissingOperandException;
23+
import org.apache.pdfbox.contentstream.operator.Operator;
24+
import org.apache.pdfbox.contentstream.operator.OperatorName;
25+
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
26+
import org.apache.pdfbox.cos.COSBase;
27+
import org.apache.pdfbox.cos.COSName;
28+
29+
/**
30+
* Operator processor for a marked-content point that carries only a tag (the MP operator of the PDF specification); it forwards the tag to {@link PDFStreamEngine#markedContentPoint}.
31+
* @author Tilman Hausherr
32+
*/
33+
public class MarkedContentPoint extends OperatorProcessor
34+
{
35+
public MarkedContentPoint(PDFStreamEngine context)
36+
{
37+
super(context);
38+
}
39+
40+
@Override
41+
public void process(Operator operator, List<COSBase> operands) throws IOException
42+
{
// the tag operand is mandatory
43+
if (operands.isEmpty())
44+
{
45+
throw new MissingOperandException(operator, operands);
46+
}
47+
COSBase base0 = operands.get(0);
48+
if (!(base0 instanceof COSName))
49+
{
// the first operand is not a name: the operator is ignored without raising an error
50+
return;
51+
}
// this operator has no property list, so null is passed as the properties argument
52+
getContext().markedContentPoint((COSName) base0, null);
53+
}
54+
55+
@Override
56+
public String getName()
57+
{
58+
return OperatorName.MARKED_CONTENT_POINT;
59+
}
60+
61+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.pdfbox.contentstream.operator.markedcontent;
18+
19+
import java.io.IOException;
20+
import java.util.List;
21+
import org.apache.pdfbox.contentstream.PDFStreamEngine;
22+
import org.apache.pdfbox.contentstream.operator.MissingOperandException;
23+
import org.apache.pdfbox.contentstream.operator.Operator;
24+
import org.apache.pdfbox.contentstream.operator.OperatorName;
25+
import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
26+
import org.apache.pdfbox.cos.COSBase;
27+
import org.apache.pdfbox.cos.COSDictionary;
28+
import org.apache.pdfbox.cos.COSName;
29+
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList;
30+
31+
/**
32+
* Operator processor for a marked-content point with a property list (the DP operator of the PDF specification); it resolves the property dictionary and forwards tag and dictionary to {@link PDFStreamEngine#markedContentPoint}.
33+
* @author Tilman Hausherr
34+
*/
35+
public class MarkedContentPointWithProperties extends OperatorProcessor
36+
{
37+
public MarkedContentPointWithProperties(PDFStreamEngine context)
38+
{
39+
super(context);
40+
}
41+
42+
@Override
43+
public void process(Operator operator, List<COSBase> operands) throws IOException
44+
{
// both the tag and the properties operand are required
45+
if (operands.size() < 2)
46+
{
47+
throw new MissingOperandException(operator, operands);
48+
}
49+
if (!(operands.get(0) instanceof COSName))
50+
{
// the first operand is not a name: the operator is ignored without raising an error
51+
return;
52+
}
53+
PDFStreamEngine context = getContext();
54+
COSName tag = (COSName) operands.get(0);
55+
COSBase op1 = operands.get(1);
56+
COSDictionary propDict = null;
// the second operand is either a name that is looked up in the current resources ...
57+
if (op1 instanceof COSName)
58+
{
59+
PDPropertyList prop = context.getResources().getProperties((COSName) op1);
60+
if (prop != null)
61+
{
62+
propDict = prop.getCOSObject();
63+
}
64+
}
// ... or the property dictionary itself, given inline
65+
else if (op1 instanceof COSDictionary)
66+
{
67+
propDict = (COSDictionary) op1;
68+
}
69+
if (propDict == null)
70+
{
71+
// wrong type or property not found
72+
return;
73+
}
74+
context.markedContentPoint(tag, propDict);
75+
}
76+
77+
@Override
78+
public String getName()
79+
{
80+
return OperatorName.MARKED_CONTENT_POINT_WITH_PROPS;
81+
}
82+
83+
}

pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,32 @@ public void endMarkedContent() throws IOException
13721372
writeOperator(OperatorName.END_MARKED_CONTENT);
13731373
}
13741374

1375+
/**
1376+
* Set a marked content point: writes the tag operand followed by the marked content point operator.
1377+
*
1378+
* @param tag the tag to be added to the content stream
1379+
* @throws IOException If the content stream could not be written
1380+
*/
1381+
public void setMarkedContentPoint(COSName tag) throws IOException
1382+
{
1383+
writeOperand(tag);
1384+
writeOperator(OperatorName.MARKED_CONTENT_POINT);
1385+
}
1386+
1387+
/**
1388+
* Set a marked content point with a reference to an entry in the page resources' Properties dictionary.
1389+
*
1390+
* @param tag the tag to be added to the content stream
1391+
* @param propertyList property list that is added to the resources; the value returned by that call is written as the second operand
1392+
* @throws IOException If the content stream could not be written
1393+
*/
1394+
public void setMarkedContentPointWithProperties(COSName tag, PDPropertyList propertyList) throws IOException
1395+
{
1396+
writeOperand(tag);
1397+
writeOperand(resources.add(propertyList));
1398+
writeOperator(OperatorName.MARKED_CONTENT_POINT_WITH_PROPS);
1399+
}
1400+
13751401
/**
13761402
* Set an extended graphics state.
13771403
*

pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
3232
import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject;
3333
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
34+
import org.apache.pdfbox.contentstream.operator.markedcontent.MarkedContentPoint;
35+
import org.apache.pdfbox.contentstream.operator.markedcontent.MarkedContentPointWithProperties;
3436

3537
/**
3638
* This is a stream engine to extract the marked content of a pdf.
@@ -45,7 +47,7 @@ public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
4547
private final Map<String, List<TextPosition>> characterListMapping = new HashMap<>();
4648

4749
/**
48-
* Instantiate a new PDFTextStripper object.
50+
* Instantiate a new PDFMarkedContentExtractor object.
4951
*/
5052
public PDFMarkedContentExtractor()
5153
{
@@ -63,8 +65,8 @@ public PDFMarkedContentExtractor(String encoding)
6365
addOperator(new BeginMarkedContentSequence(this));
6466
addOperator(new EndMarkedContentSequence(this));
6567
addOperator(new DrawObject(this));
66-
// todo: DP - Marked Content Point
67-
// todo: MP - Marked Content Point with Properties
68+
addOperator(new MarkedContentPoint(this));
69+
addOperator(new MarkedContentPointWithProperties(this));
6870
}
6971

7072
/**
@@ -129,6 +131,13 @@ public void endMarkedContentSequence()
129131
}
130132
}
131133

134+
@Override
135+
public void markedContentPoint(COSName tag, COSDictionary properties)
136+
{
137+
// This extractor does nothing with marked-content points yet and only delegates to the superclass. If you know anything useful that should happen, please tell us.
138+
super.markedContentPoint(tag, properties);
139+
}
140+
132141
public void xobject(PDXObject xobject)
133142
{
134143
if (!this.currentMarkedContents.isEmpty())

0 commit comments

Comments
 (0)