Skip to content

Commit ef02a17

Browse files
committed
Improved PDF text parsing
1 parent 53c7231 commit ef02a17

File tree

4 files changed

+66
-8
lines changed

4 files changed

+66
-8
lines changed

js/containers/fontContainer.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ export class FontCont {
412412
family = 'NimbusSans';
413413
} else if (/Arial/i.test(family)) {
414414
family = 'NimbusSans';
415-
} else if (/Century/i.test(family)) {
415+
} else if (/CenturySch/i.test(family)) {
416416
family = 'Century';
417417
} else if (/Palatino/i.test(family)) {
418418
family = 'Palatino';

js/import/convertPageStext.js

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import {
44
calcBboxUnion,
55
calcBoxOverlap,
66
calcLang,
7+
cleanFamilyName,
78
mean50,
89
round6,
910
unescapeXml,
@@ -33,6 +34,9 @@ export async function convertPageStext({ ocrStr, n }) {
3334
/** @type {Set<string>} */
3435
const langSet = new Set();
3536

37+
/** @type {Set<string>} */
38+
const fontSet = new Set();
39+
3640
function convertParStext(xmlPar) {
3741
/** @type {Array<OcrLine>} */
3842
const parLineArr = [];
@@ -245,7 +249,7 @@ export async function convertPageStext({ ocrStr, n }) {
245249

246250
if (wordCharOrFontArr[i].length === 0) continue;
247251

248-
let wordInit = false;
252+
let wordCharN = 0;
249253

250254
for (let j = 0; j < wordCharOrFontArr[i].length; j++) {
251255
const charOrFont = wordCharOrFontArr[i][j];
@@ -257,7 +261,7 @@ export async function convertPageStext({ ocrStr, n }) {
257261
// (2) Runs of small caps that start with lower-case letters, which do not conform to the expectation that runs of small caps start with a capital letter.
258262
const sizePrevRaw = sizeCurrentRaw;
259263
sizeCurrentRaw = charOrFont.size;
260-
const secondLetter = wordInit && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
264+
const secondLetter = wordCharN > 0 && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
261265

262266
let baselineNextLetter;
263267
const possibleNextLetter1 = wordCharOrFontArr[i][j + 1];
@@ -356,8 +360,6 @@ export async function convertPageStext({ ocrStr, n }) {
356360
smallCapsWord = smallCapsCurrent;
357361

358362
if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /-it$/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
359-
// The word is already initialized, so we need to change the last element of the style array.
360-
// Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
361363
italicCurrent = true;
362364
} else {
363365
italicCurrent = false;
@@ -374,11 +376,13 @@ export async function convertPageStext({ ocrStr, n }) {
374376
baselineCurrent = charOrFont.origin.y;
375377
}
376378

377-
if (!wordInit) {
379+
// This condition should make a word italic if the characters themselves are italic,
380+
// even if leading/trailing punctuation is not italic.
381+
if (wordCharN === 0 || wordCharN < 3 && /[A-Z\d]/i.test(charOrFont.text)) {
378382
boldWord = boldCurrent;
379383
italicWord = italicCurrent;
380384

381-
wordInit = true;
385+
wordCharN++;
382386
}
383387

384388
let bbox;
@@ -598,6 +602,8 @@ export async function convertPageStext({ ocrStr, n }) {
598602

599603
wordObj.style.font = fontFamilyArr[i];
600604

605+
fontSet.add(cleanFamilyName(wordObj.style.font));
606+
601607
wordObj.style.sup = superArr[i];
602608

603609
wordObj.style.underline = underlineArr[i];
@@ -683,5 +689,7 @@ export async function convertPageStext({ ocrStr, n }) {
683689
});
684690
}
685691

686-
return { pageObj, dataTables: dataTablePage, langSet };
692+
return {
693+
pageObj, dataTables: dataTablePage, langSet, fontSet,
694+
};
687695
}

js/utils/miscUtils.js

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,3 +527,47 @@ export const getStyleLookup = (style) => {
527527
}
528528
return styleStr;
529529
};
530+
531+
/**
 * Ordered lookup table used by `cleanFamilyName`.
 * Each entry pairs a case-insensitive pattern with the standardized name returned on a match.
 * Entries are tested in order and the first match wins.
 * Multi-word families accept one optional whitespace, hyphen, or underscore between words,
 * so that e.g. "Times New Roman" and "TimesNewRoman" normalize to the same name.
 * Separators are only tolerated at these word boundaries (rather than stripped from the whole name)
 * to avoid creating matches that span unrelated words.
 * None of the patterns use the `g` or `y` flags, so `test` is stateless and the table is safe to reuse.
 * @type {ReadonlyArray<[RegExp, string]>}
 */
const FAMILY_NAME_PATTERNS = [
  [/Nimbus[\s_-]?Rom/i, 'NimbusRoman'],
  [/Times[\s_-]?New[\s_-]?Rom/i, 'TimesNewRoman'],
  [/Nimbus[\s_-]?San/i, 'NimbusSans'],
  [/Helvetica/i, 'Helvetica'],
  [/Arial/i, 'Arial'],
  [/Century[\s_-]?Sch/i, 'CenturySchoolbook'],
  [/Palatino/i, 'Palatino'],
  [/Garamond/i, 'Garamond'],
  [/Century[\s_-]?Gothic/i, 'CenturyGothic'],
  [/Avant[\s_-]?Garde/i, 'AvantGarde'],
  [/Carlito/i, 'Carlito'],
  [/Calibri/i, 'Calibri'],
  [/Courier/i, 'Courier'],
  [/Nimbus[\s_-]?Mono/i, 'NimbusMono'],
  [/Dingbats/i, 'Dingbats'],
  [/Wingdings/i, 'Wingdings'],
];

/**
 * Identify specific font families and return a standardized name.
 * This function is not intended to map names to fonts supported by this program,
 * but rather simply to normalize the names of common fonts to a single name.
 *
 * Matching is case-insensitive and unanchored, so any prefix or suffix around the
 * family name (style suffixes, foundry tags, etc.) is ignored.
 *
 * @param {string} family - Font family name as reported by the source document.
 * @returns {string} The standardized family name, or `family` unchanged when no known family is recognized.
 */
export const cleanFamilyName = (family) => {
  const match = FAMILY_NAME_PATTERNS.find(([pattern]) => pattern.test(family));
  return match ? match[1] : family;
};

tests/module/importPdfText.spec.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,12 @@ describe('Check that font style is detected for PDF imports.', function () {
410410
assert.isFalse(scribe.data.ocr.active[5].lines[22].words[4].style.underline);
411411
}).timeout(10000);
412412

413+
it('Italic style is detected when leading punctuation is non-italic', async () => {
414+
await scribe.importFiles([`${ASSETS_PATH_KARMA}/high-risk_protection_order_application_for_and_declaration_in_support_of_mandatory_use.pdf`]);
415+
assert.strictEqual(scribe.data.ocr.active[0].lines[15].words[1].text, '(Print');
416+
assert.isTrue(scribe.data.ocr.active[0].lines[15].words[1].style.italic);
417+
}).timeout(10000);
418+
413419
it('Bold + italic style is detected', async () => {
414420
await scribe.importFiles([`${ASSETS_PATH_KARMA}/complaint_1.pdf`]);
415421
assert.isTrue(scribe.data.ocr.active[0].lines[1].words[0].style.italic);

0 commit comments

Comments
 (0)