Skip to content

Commit 207335a

Browse files
committed
PDFBOX-5747: fix combining diacritics, by Richard Eckart de Castilho
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1923374 13f79535-47bb-0310-9956-ffa450edef68
1 parent 0b8bc2d commit 207335a

File tree

4 files changed

+15
-3
lines changed

4 files changed

+15
-3
lines changed

pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -759,16 +759,26 @@ private void insertDiacritic(int i, TextPosition diacritic)
759759
float[] widths2 = new float[widths.length + 1];
760760
System.arraycopy(widths, 0, widths2, 0, i);
761761

762+
// First, keep the base character's width at index i and insert a zero-width entry for the diacritic at index i + 1 of the widths array
763+
widths2[i] = widths[i];
764+
widths2[i + 1] = 0;
765+
System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
766+
762767
// Unicode combining diacritics always go after the base character, regardless of whether
763768
// the string is in presentation order or logical order
764769
sb.append(unicode.charAt(i));
765-
widths2[i] = widths[i];
770+
771+
// If a surrogate pair starts at the current position, keep both of its chars together so the diacritic is appended after the complete pair
772+
if (i < unicode.length() - 1 && Character.isSurrogatePair(unicode.charAt(i), unicode.charAt(i + 1)))
773+
{
774+
sb.append(unicode.charAt(i + 1));
775+
i++;
776+
}
777+
766778
sb.append(combineDiacritic(diacritic.getUnicode()));
767-
widths2[i + 1] = 0;
768779

769780
// get the rest of the string
770781
sb.append(unicode.substring(i + 1));
771-
System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
772782

773783
unicode = sb.toString();
774784
widths = widths2;
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
𝑋̂
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
𝑋̂

0 commit comments

Comments
 (0)