Skip to content

Commit 374972f

Browse files
committed
PDFBOX-5487: Remove all space characters if contained within the adjacent letters, by Mohamed M NourElDin; closes #155
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1922514 13f79535-47bb-0310-9956-ffa450edef68
1 parent e4f814c commit 374972f

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,11 @@ protected void writePage() throws IOException
523523
{
524524
IterativeMergeSort.sort(textList, comparator);
525525
}
526+
finally
527+
{
528+
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
529+
removeContainedSpaces(textList);
530+
}
526531
}
527532

528533
startArticle();
@@ -724,6 +729,34 @@ private boolean overlap(float y1, float height1, float y2, float height2)
724729
|| y1 <= y2 && y1 >= y2 - height2;
725730
}
726731

732+
/**
733+
* Remove all space characters if contained within the adjacent letters
734+
*/
735+
private void removeContainedSpaces(List<TextPosition> textList)
736+
{
737+
TextPosition position, previousPosition;
738+
Iterator<TextPosition> iterator = textList.iterator();
739+
740+
if (!iterator.hasNext())
741+
{
742+
return;
743+
}
744+
previousPosition = iterator.next();
745+
746+
while (iterator.hasNext())
747+
{
748+
position = iterator.next();
749+
if (" ".equals(position.getUnicode()) && previousPosition.completelyContains(position))
750+
{
751+
iterator.remove();
752+
}
753+
else
754+
{
755+
previousPosition = position;
756+
}
757+
}
758+
}
759+
727760
/**
728761
* Write the line separator value to the output stream.
729762
*

pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,55 @@ else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
607607
return true;
608608
}
609609

610+
/**
611+
* Determine if this TextPosition perfectly contains another (i.e. the other TextPosition
612+
* overlaps 100% with this one and fits entirely inside its bounding box when they are rendered
613+
* on top of each other).
614+
*
615+
* @param tp2 The other TestPosition to compare against
616+
* @return True if tp2 is contained completely inside the bounding box of this text.
617+
*/
618+
public boolean completelyContains(TextPosition tp2)
619+
{
620+
// Note: (0, 0) is in the upper left and y-coordinate is top of TextPosition
621+
// +---thisTop------------+
622+
// | +--tp2Top---+ |
623+
// | | | |
624+
// thisLeft | tp2Right |
625+
// | tp2Left | thisRight
626+
// | | | |
627+
// | +-tp2Bottom-+ |
628+
// +---------thisBottom---+
629+
630+
float thisLeft = getXDirAdj();
631+
float thisWidth = getWidthDirAdj();
632+
float thisRight = thisLeft + thisWidth;
633+
634+
float tp2Left = tp2.getXDirAdj();
635+
float tp2Width = tp2.getWidthDirAdj();
636+
float tp2Right = tp2Left + tp2Width;
637+
638+
if (thisLeft > tp2Left || tp2Right > thisRight)
639+
{
640+
return false;
641+
}
642+
643+
float thisTop = getYDirAdj();
644+
float thisHeight = getHeightDir();
645+
float thisBottom = thisTop + thisHeight;
646+
647+
float tp2Top = tp2.getYDirAdj();
648+
float tp2Height = tp2.getHeightDir();
649+
float tp2Bottom = tp2Top + tp2Height;
650+
651+
if (thisTop > tp2Top || tp2Bottom > thisBottom)
652+
{
653+
return false;
654+
}
655+
656+
return true;
657+
}
658+
610659
/**
611660
* Merge a single character TextPosition into the current object. This is to be used only for
612661
* cases where we have a diacritic that overlaps an existing TextPosition. In a graphical

0 commit comments

Comments
 (0)