Skip to content

Commit 8c91af8

Browse files
Snipx authored and itext-teamcity committed
Use /ToUnicode mappings to several chars during text extraction from simple fonts
Refactor existing PdfFont#decode(PdfString) and PdfFont#getContentWidth(PdfString) methods DEVSIX-1681 Autoported commit. Original commit hash: [9a3157e04]
1 parent 15f9f5b commit 8c91af8

File tree

11 files changed

+2090
-1121
lines changed

11 files changed

+2090
-1121
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
using System;
2+
using iText.Kernel.Pdf;
3+
using iText.Kernel.Pdf.Canvas.Parser.Listener;
4+
using iText.Test;
5+
6+
namespace iText.Kernel.Pdf.Canvas.Parser {
7+
/// <summary>
/// Text-extraction tests that read the first page of sample PDF files from the
/// <c>SimpleFontToUnicodeExtractionTest</c> resource folder and compare the extracted
/// text with a hard-coded expected string.
/// </summary>
/// <remarks>
/// NOTE(review): judging by the class name and resource names, the sample files exercise
/// simple fonts with /ToUnicode mappings; the contents of the PDFs are not visible here.
/// </remarks>
public class SimpleFontToUnicodeExtractionTest : ExtendedITextTest {
    // Folder holding the sample PDFs, resolved relative to the NUnit test directory.
    private static readonly String sourceFolder = iText.Test.TestUtil.GetParentProjectDirectory(NUnit.Framework.TestContext
        .CurrentContext.TestDirectory) + "/resources/itext/kernel/parser/SimpleFontToUnicodeExtractionTest/";

    /// <summary>
    /// Extracts page 1 of <c>simpleFontToUnicode01.pdf</c> and checks it against the expected
    /// multi-line paragraph (line breaks and trailing spaces included).
    /// </summary>
    /// <exception cref="System.IO.IOException"/>
    [NUnit.Framework.Test]
    public virtual void Test01() {
        String expected = "Information plays a central role in soci-\n" + "ety today, and it is becoming more and \n"
            + "more common for that information to \n" + "be offered in digital form alone. The re-\n" + "liable, user-friendly Portable Document \n"
            + "Format (PDF) has become the world’s \n" + "file type of choice for providing infor-\n" + "mation as a digital document. \n"
            + "Tags can be added to a PDF in order \n" + "to structure the content of a document. \n" + "These tags are a critical requirement if \n"
            + "any form of assistive technology (such \n" + "as screen readers, specialist mice, and \n" + "speech recognition and text-to-speech \n"
            + "software) is to gain access to this con-\n" + "tent. To date, PDF documents have rare-\n" + "ly been tagged, and not all software can \n"
            + "make use of PDF tags. In practical terms, \n" + "this particularly reduces information‘s \n" + "accessibility for people with disabilities \n"
            + "who rely on assistive technology.";
        String actualText = ExtractFirstPageText("simpleFontToUnicode01.pdf");
        NUnit.Framework.Assert.AreEqual(expected, actualText);
    }

    /// <summary>
    /// Extracts page 1 of <c>simpleFontToUnicode02.pdf</c> and checks that the result is
    /// exactly <c>"ffaast"</c>.
    /// </summary>
    /// <exception cref="System.IO.IOException"/>
    [NUnit.Framework.Test]
    public virtual void Test02() {
        String expected = "ffaast";
        String actualText = ExtractFirstPageText("simpleFontToUnicode02.pdf");
        NUnit.Framework.Assert.AreEqual(expected, actualText);
    }

    /// <summary>
    /// Opens the named PDF from <c>sourceFolder</c>, extracts the text of its first page with a
    /// <c>LocationTextExtractionStrategy</c>, and closes the document again.
    /// </summary>
    /// <param name="fileName">file name of the sample PDF, relative to <c>sourceFolder</c></param>
    /// <returns>the text extracted from page 1</returns>
    /// <exception cref="System.IO.IOException"/>
    private static String ExtractFirstPageText(String fileName) {
        PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + fileName));
        try {
            return PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(1), new LocationTextExtractionStrategy());
        }
        finally {
            // Close even when extraction throws, so the handle on the source file is
            // released and does not leak into later tests.
            pdfDocument.Close();
        }
    }
}
38+
}

itext.tests/itext.kernel.tests/itext/kernel/pdf/canvas/parser/TextRenderInfoTest.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public virtual void TestCharacterRenderInfos() {
7979
/// </remarks>
8080
/// <exception cref="System.Exception"/>
8181
[NUnit.Framework.Test]
82-
[LogMessage(iText.IO.LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE)]
82+
[LogMessage(iText.IO.LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, Count = 2)]
8383
public virtual void TestUnicodeEmptyString() {
8484
StringBuilder sb = new StringBuilder();
8585
String inFile = "japanese_text.pdf";

0 commit comments

Comments
 (0)