Skip to content

Commit 3fc0f67

Browse files
committed
PDFBOX-5230: make zero-width non-joiner characters invisible, by Daniel Gredler
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1924224 13f79535-47bb-0310-9956-ffa450edef68
1 parent 907030f commit 3fc0f67

File tree

5 files changed

+170
-5
lines changed

5 files changed

+170
-5
lines changed

fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ public class GlyfSimpleDescript extends GlyfDescript
5151
super(numberOfContours);
5252

5353
/*
54-
* https://developer.apple.com/fonts/TTRefMan/RM06/Chap6glyf.html
54+
* https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6glyf.html
5555
* "If a glyph has zero contours, it need not have any glyph data." set the pointCount to zero to initialize
5656
* attributes and avoid a NullPointerException, but maybe there shouldn't be a GlyphDescript in the GlyphData?
5757
*/

fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import java.nio.charset.StandardCharsets;
2727
import java.util.Calendar;
2828
import java.util.HashMap;
29+
import java.util.HashSet;
2930
import java.util.Iterator;
3031
import java.util.LinkedHashMap;
3132
import java.util.List;
@@ -52,7 +53,7 @@ public final class TTFSubsetter
5253
{
5354
private static final Logger LOG = LogManager.getLogger(TTFSubsetter.class);
5455

55-
private static final byte[] PAD_BUF = { 0, 0, 0 };
56+
private static final byte[] PAD_BUF = { 0, 0, 0, 0 };
5657

5758
private static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC"); // clone before using
5859

@@ -62,6 +63,7 @@ public final class TTFSubsetter
6263

6364
private final List<String> keepTables;
6465
private final SortedSet<Integer> glyphIds; // new glyph ids
66+
private final Set<Integer> invisibleGlyphIds;
6567
private String prefix;
6668
private boolean hasAddedCompoundReferences;
6769

@@ -92,6 +94,7 @@ public TTFSubsetter(TrueTypeFont ttf, List<String> tables) throws IOException
9294

9395
uniToGID = new TreeMap<>();
9496
glyphIds = new TreeSet<>();
97+
invisibleGlyphIds = new HashSet<>();
9598

9699
// find the best Unicode cmap
97100
this.unicodeCmap = ttf.getUnicodeCmapLookup();
@@ -135,6 +138,23 @@ public void addAll(Set<Integer> unicodeSet)
135138
unicodeSet.forEach(this::add);
136139
}
137140

141+
/**
142+
* Forces the glyph for the specified character code to be zero-width and contour-free,
143+
* regardless of what the glyph looks like in the original font. Note that the specified
144+
* character code is not added to the subset unless it is also {@link #add(int) added}
145+
* separately.
146+
*
147+
* @param unicode the character code whose glyph should be invisible
148+
*/
149+
public void forceInvisible(int unicode)
150+
{
151+
int gid = unicodeCmap.getGlyphId(unicode);
152+
if (gid != 0)
153+
{
154+
invisibleGlyphIds.add(gid);
155+
}
156+
}
157+
138158
/**
139159
* Returns the map of new -&gt; old GIDs.
140160
*
@@ -611,6 +631,13 @@ private byte[] buildGlyfTable(long[] newOffsets) throws IOException
611631
isResult);
612632
}
613633

634+
// glyphs with no outlines have an empty entry in the 'glyf' table, with a
635+
// corresponding 'loca' table entry with length = 0
636+
if (invisibleGlyphIds.contains(gid))
637+
{
638+
continue;
639+
}
640+
614641
byte[] buf = new byte[(int)length];
615642
isResult = is.read(buf);
616643

@@ -921,9 +948,18 @@ private byte[] buildHmtxTable() throws IOException
921948
long offset;
922949
if (glyphId <= lastgid)
923950
{
924-
// copy width and lsb
925-
offset = glyphId * 4l;
926-
lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
951+
if (invisibleGlyphIds.contains(glyphId))
952+
{
953+
// force zero width (no change to last offset)
954+
// 4 bytes total, 2 bytes each for: advance width = 0, left side bearing = 0
955+
bos.write(PAD_BUF, 0, 4);
956+
}
957+
else
958+
{
959+
// copy width and lsb
960+
offset = glyphId * 4l;
961+
lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
962+
}
927963
}
928964
else
929965
{

fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
import static org.junit.jupiter.api.Assertions.assertEquals;
3131
import static org.junit.jupiter.api.Assertions.assertFalse;
32+
import static org.junit.jupiter.api.Assertions.assertNotEquals;
3233
import static org.junit.jupiter.api.Assertions.assertNotNull;
3334
import static org.junit.jupiter.api.Assertions.assertNull;
3435
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -294,4 +295,76 @@ void testPDFBox5728() throws IOException
294295
subsetter.writeToStream(output);
295296
}
296297
}
298+
299+
/**
300+
* Test of PDFBOX-5230: check that subsetting can be forced to use invisible glyphs.
301+
*
302+
* @throws java.io.IOException
303+
*/
304+
@Test
305+
void testPDFBox5230() throws IOException
306+
{
307+
final File testFile = new File("src/test/resources/ttf/LiberationSans-Regular.ttf");
308+
TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBufferedFile(testFile));
309+
TTFSubsetter ttfSubsetter = new TTFSubsetter(ttf);
310+
ttfSubsetter.add('A');
311+
ttfSubsetter.add('B');
312+
ttfSubsetter.add('\u200C');
313+
314+
// verify results without forcing
315+
316+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
317+
ttfSubsetter.writeToStream(baos);
318+
try (TrueTypeFont subset = new TTFParser(true)
319+
.parse(new RandomAccessReadBuffer(baos.toByteArray())))
320+
{
321+
assertEquals(4, subset.getNumberOfGlyphs());
322+
assertEquals(0, subset.nameToGID(".notdef"));
323+
assertEquals(1, subset.nameToGID("A"));
324+
assertEquals(2, subset.nameToGID("B"));
325+
assertEquals(3, subset.nameToGID("uni200C"));
326+
327+
PostScriptTable pst = subset.getPostScript();
328+
assertEquals(".notdef", pst.getName(0));
329+
assertEquals("A", pst.getName(1));
330+
assertEquals("B", pst.getName(2));
331+
assertEquals("uni200C", pst.getName(3));
332+
333+
assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path should not be empty");
334+
assertFalse(subset.getPath("B").getBounds2D().isEmpty(), "B path should not be empty");
335+
assertFalse(subset.getPath("uni200C").getBounds2D().isEmpty(), "ZWNJ path should not be empty");
336+
assertNotEquals(0, subset.getWidth("A"), "A width should not be zero.");
337+
assertNotEquals(0, subset.getWidth("B"), "B width should not be zero.");
338+
assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be zero");
339+
}
340+
341+
// verify results while forcing B and ZWNJ to use invisible glyphs
342+
343+
ttfSubsetter.forceInvisible('B');
344+
ttfSubsetter.forceInvisible('\u200C');
345+
ByteArrayOutputStream baos2 = new ByteArrayOutputStream();
346+
ttfSubsetter.writeToStream(baos2);
347+
try (TrueTypeFont subset = new TTFParser(true)
348+
.parse(new RandomAccessReadBuffer(baos2.toByteArray())))
349+
{
350+
assertEquals(4, subset.getNumberOfGlyphs());
351+
assertEquals(0, subset.nameToGID(".notdef"));
352+
assertEquals(1, subset.nameToGID("A"));
353+
assertEquals(2, subset.nameToGID("B"));
354+
assertEquals(3, subset.nameToGID("uni200C"));
355+
356+
PostScriptTable pst = subset.getPostScript();
357+
assertEquals(".notdef", pst.getName(0));
358+
assertEquals("A", pst.getName(1));
359+
assertEquals("B", pst.getName(2));
360+
assertEquals("uni200C", pst.getName(3));
361+
362+
assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path should not be empty");
363+
assertTrue(subset.getPath("B").getBounds2D().isEmpty(), "B path should be empty");
364+
assertTrue(subset.getPath("uni200C").getBounds2D().isEmpty(), "ZWNJ path should be empty");
365+
assertNotEquals(0, subset.getWidth("A"), "A width should not be zero.");
366+
assertEquals(0, subset.getWidth("B"), "B width should be zero.");
367+
assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be zero");
368+
}
369+
}
297370
}

pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,10 @@ public void subset() throws IOException
326326
// set the GIDs to subset
327327
TTFSubsetter subsetter = new TTFSubsetter(ttf, tables);
328328
subsetter.addAll(subsetCodePoints);
329+
subsetter.forceInvisible('\u200B'); // ZWSP
330+
subsetter.forceInvisible('\u200C'); // ZWNJ
331+
subsetter.forceInvisible('\u2060'); // WJ
332+
subsetter.forceInvisible('\uFEFF'); // ZWNBSP
329333

330334
if (!allGlyphIds.isEmpty())
331335
{

pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.apache.pdfbox.cos.COSName;
3636
import org.apache.pdfbox.pdmodel.PDDocument;
3737
import org.apache.pdfbox.pdmodel.PDPage;
38+
import org.apache.pdfbox.pdmodel.PDResources;
3839
import org.apache.pdfbox.pdmodel.common.PDRectangle;
3940
import org.apache.pdfbox.pdmodel.PDPageContentStream;
4041
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
@@ -692,4 +693,55 @@ void testSurrogatePairCharacterExceptionIsValidCodePoint() throws IOException
692693
fail();
693694
}
694695
}
696+
697+
/**
698+
* PDFBOX-5230: Zero-width characters should be invisible.
699+
*
700+
* @throws IOException
701+
*/
702+
@Test
703+
void testEmbeddedFontWithZeroWidthChars() throws IOException
704+
{
705+
String text = "AAA\u200CBBB";
706+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
707+
try (PDDocument document = new PDDocument())
708+
{
709+
PDPage page = new PDPage();
710+
document.addPage(page);
711+
InputStream input = PDFont.class.getResourceAsStream(
712+
"/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
713+
PDType0Font font = PDType0Font.load(document, input);
714+
try (PDPageContentStream stream = new PDPageContentStream(document, page))
715+
{
716+
stream.beginText();
717+
stream.setFont(font, 20);
718+
stream.newLineAtOffset(50, 600);
719+
stream.showText(text);
720+
stream.endText();
721+
}
722+
document.save(baos);
723+
}
724+
try (PDDocument document = Loader.loadPDF(baos.toByteArray()))
725+
{
726+
// verify that the text still contains zero-width characters
727+
PDFTextStripper stripper = new PDFTextStripper();
728+
String extractedText = stripper.getText(document).trim();
729+
assertEquals(text, extractedText);
730+
assertEquals(7, extractedText.length());
731+
assertEquals('\u200C', extractedText.charAt(3));
732+
733+
// verify that the zero-width characters are invisible
734+
PDPage page = document.getPage(0);
735+
PDResources resources = page.getResources();
736+
Iterable< COSName > fontNames = resources.getFontNames();
737+
COSName fontName = fontNames.iterator().next();
738+
PDType0Font font = (PDType0Font) resources.getFont(fontName);
739+
byte[] encoded = font.encode('\u200C');
740+
int code = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
741+
assertEquals(0, font.getWidth(code));
742+
assertEquals(0, font.getWidthFromFont(code));
743+
assertTrue(font.getPath(code).getBounds2D().isEmpty());
744+
assertFalse(font.isDamaged());
745+
}
746+
}
695747
}

0 commit comments

Comments
 (0)