Skip to content

Commit 76e1ae1

Browse files
committed
PDFBOX-3774: conditionally ignore spaces from the content stream; add setting + getter/setter + test + code simplification by Kevin Day
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1922535 13f79535-47bb-0310-9956-ffa450edef68
1 parent 090c983 commit 76e1ae1

File tree

2 files changed

+86
-5
lines changed

2 files changed

+86
-5
lines changed

pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine
146146
private boolean shouldSeparateByBeads = true;
147147
private boolean sortByPosition = false;
148148
private boolean addMoreFormatting = false;
149+
private boolean ignoreContentStreamSpaceGlyphs = false;
149150

150151
private float indentThreshold = defaultIndentThreshold;
151152
private float dropThreshold = defaultDropThreshold;
@@ -523,11 +524,8 @@ protected void writePage() throws IOException
523524
{
524525
IterativeMergeSort.sort(textList, comparator);
525526
}
526-
finally
527-
{
528-
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
529-
removeContainedSpaces(textList);
530-
}
527+
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
528+
removeContainedSpaces(textList);
531529
}
532530

533531
startArticle();
@@ -555,6 +553,12 @@ protected void writePage() throws IOException
555553
PositionWrapper current = new PositionWrapper(position);
556554
String characterValue = position.getUnicode();
557555

556+
// PDFBOX-3774: conditionally ignore spaces from the content stream
557+
if (" ".equals(characterValue) && getIgnoreContentStreamSpaceGlyphs())
558+
{
559+
continue;
560+
}
561+
558562
// Resets the average character width when we see a change in font
559563
// or a change in the font size
560564
if (lastPosition != null &&
@@ -1276,6 +1280,32 @@ public void setSortByPosition(boolean newSortByPosition)
12761280
sortByPosition = newSortByPosition;
12771281
}
12781282

1283+
/**
1284+
* Determines whether spaces in the content stream text rendering instructions will be ignored
1285+
* during text extraction.
1286+
*
1287+
* @return true if space glyphs in the content stream text rendering instructions will be
1288+
* ignored - default is false
1289+
*/
1290+
public boolean getIgnoreContentStreamSpaceGlyphs()
1291+
{
1292+
return ignoreContentStreamSpaceGlyphs;
1293+
}
1294+
1295+
/**
1296+
* Instruct the algorithm to ignore any spaces in the text rendering instructions in the content
1297+
* stream, and instead rely purely on the algorithm to determine where word breaks are.
1298+
*
1299+
* This can improve text extraction results where the content stream is sorted by position and
1300+
has text overlapping spaces, but could cause some word breaks to not be added to the output.
1301+
*
1302+
* @param newIgnoreContentStreamSpaceGlyphs whether PDFBox should ignore content stream spaces
1303+
*/
1304+
public void setIgnoreContentStreamSpaceGlyphs(boolean newIgnoreContentStreamSpaceGlyphs)
1305+
{
1306+
ignoreContentStreamSpaceGlyphs = newIgnoreContentStreamSpaceGlyphs;
1307+
}
1308+
12791309
/**
12801310
* Get the current space width-based tolerance value that is being used to estimate where spaces in text should be
12811311
* added. Note that the default value for this has been determined from trial and error.

pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,13 @@
5454
import org.apache.fontbox.util.BoundingBox;
5555
import org.apache.pdfbox.Loader;
5656
import org.apache.pdfbox.pdmodel.PDDocument;
57+
import org.apache.pdfbox.pdmodel.PDPage;
58+
import org.apache.pdfbox.pdmodel.PDPageContentStream;
5759
import org.apache.pdfbox.pdmodel.font.PDFont;
5860
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
61+
import org.apache.pdfbox.pdmodel.font.PDType1Font;
5962
import org.apache.pdfbox.pdmodel.font.PDType3Font;
63+
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
6064
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
6165
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
6266
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
@@ -688,4 +692,51 @@ void testStartEndPage() throws IOException
688692
assertEquals(1378, text.replaceAll("\r", "").length());
689693
}
690694
}
695+
696+
/**
697+
* PDFBOX-3774: test the IgnoreContentStreamSpaceGlyphs option.
698+
*
699+
* @throws Exception
700+
*/
701+
@Test
702+
void testIgnoreContentStreamSpaceGlyphs() throws Exception
703+
{
704+
try (PDDocument doc = new PDDocument())
705+
{
706+
PDPage page = new PDPage();
707+
try (PDPageContentStream cs = new PDPageContentStream(doc, page))
708+
{
709+
float fontHeight = 8;
710+
float x = 50;
711+
float y = page.getMediaBox().getHeight() - 50;
712+
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
713+
cs.beginText();
714+
cs.setFont(font, fontHeight);
715+
cs.newLineAtOffset(x, y);
716+
cs.showText("( )");
717+
cs.endText();
718+
719+
int indent = 6;
720+
float overlapX = x + indent * font.getAverageFontWidth() / 1000f * fontHeight;
721+
PDFont overlapFont = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
722+
cs.beginText();
723+
cs.setFont(overlapFont, fontHeight * 2f);
724+
cs.newLineAtOffset(overlapX, y);
725+
cs.showText("overlap");
726+
cs.endText();
727+
}
728+
doc.addPage(page);
729+
730+
PDFTextStripper localStripper = new PDFTextStripper();
731+
localStripper.setLineSeparator("\n");
732+
localStripper.setPageEnd("\n");
733+
localStripper.setStartPage(1);
734+
localStripper.setEndPage(1);
735+
localStripper.setSortByPosition(true);
736+
737+
localStripper.setIgnoreContentStreamSpaceGlyphs(true);
738+
String text = localStripper.getText(doc);
739+
assertEquals("( overlap )\n", text);
740+
}
741+
}
691742
}

0 commit comments

Comments
 (0)