diff --git a/base/core/evaluator.js b/base/core/evaluator.js index 02882e7..9f61fc1 100755 --- a/base/core/evaluator.js +++ b/base/core/evaluator.js @@ -506,19 +506,84 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { var fontResources = font.get('Resources') || resources; var charProcKeys = Object.keys(charProcs); var charProcOperatorList = {}; + + info(`Processing Type3 font: ${fontName}, found ${charProcKeys.length} CharProcs`); + + // Create a mapping from character code to glyph name + var charProcMapping = {}; + var encoding = font.get('Encoding'); + + if (encoding) { + info(`Type3 font has encoding: ${encoding.name || 'custom'}`); + var differences = encoding.get('Differences'); + var baseEncoding = encoding.get('BaseEncoding'); + + // Process Differences array if it exists + if (differences) { + info(`Processing Differences array of length ${differences.length}`); + var currentCode = 0; + for (var i = 0; i < differences.length; i++) { + var entry = differences[i]; + if (typeof entry === 'number') { + currentCode = entry; + info(`Setting current code to ${currentCode}`); + } else { + // Check the type of entry to debug what's happening + var entryType = typeof entry; + var entryValue; + + // Ensure we always get a string name (not an object) + if (entryType === 'object' && entry.name) { + entryValue = entry.name; + } else if (entryType === 'object') { + entryValue = JSON.stringify(entry); + info(`Warning: Non-name object in Differences array: ${entryValue}`); + } else { + entryValue = entry.toString(); + } + + // info(`Entry type: ${entryType}, value: ${entryValue}`); + + charProcMapping[currentCode] = entryValue; + // info(`Mapped code ${currentCode} to glyph '${entryValue}'`); + currentCode++; + } + } + } + // Use BaseEncoding if available + if (baseEncoding && baseEncoding.name) { + info(`Using BaseEncoding: ${baseEncoding.name}`); + var baseEncodingMap = Encodings[baseEncoding.name]; + if (baseEncodingMap) { + for (var code = 0; code < 256; code++) { + if (!charProcMapping[code] && baseEncodingMap[code]) { + charProcMapping[code] = baseEncodingMap[code]; + // info(`Mapped code ${code} to glyph '${baseEncodingMap[code]}' from BaseEncoding`); + } + } + } + } + } + + // Store the mapping in the font object for text extraction + font.translated.charProcMapping = charProcMapping; + // info(`Final charProcMapping has ${Object.keys(charProcMapping).length} entries`); + for (var i = 0, n = charProcKeys.length; i < n; ++i) { var key = charProcKeys[i]; var glyphStream = charProcs[key]; var operatorList = this.getOperatorList(glyphStream, fontResources); charProcOperatorList[key] = operatorList.getIR(); + // info(`Processed CharProc for glyph '${key}'`); if (!parentOperatorList) { continue; } // Add the dependencies to the parent operator list so they are // resolved before sub operator list is executed synchronously. - parentOperatorList.addDependencies(charProcOperatorList.dependencies); + parentOperatorList.addDependencies(operatorList.dependencies); } font.translated.charProcOperatorList = charProcOperatorList; + font.translated.charProcMapping = charProcMapping; font.loaded = true; } else { font.loaded = true; diff --git a/base/core/fonts.js b/base/core/fonts.js index 6cc8433..579f2f4 100755 --- a/base/core/fonts.js +++ b/base/core/fonts.js @@ -2185,11 +2185,40 @@ var Font = (function FontClosure() { this.cmap = properties.cmap; this.fontMatrix = properties.fontMatrix; - if (properties.type == 'Type3') { - this.encoding = properties.baseEncoding; - return; + if (properties.type == 'Type3') { + this.encoding = properties.baseEncoding; + this.disableFontFace = true; + this.loadedName = this.loadedName || 'Type3Font'; + + // Add ability to map Type3 font glyphs to Unicode characters + if (properties.toUnicode) { + this.toUnicode = properties.toUnicode; + } else { + // Create a basic toUnicode map for common glyph names + const toUnicode = {}; + const encoding = properties.baseEncoding || []; + for (let i = 0; i < encoding.length; i++) { + const glyphName = encoding[i]; + if (glyphName && GlyphsUnicode[glyphName]) { + toUnicode[i] = String.fromCharCode(GlyphsUnicode[glyphName]); + } + } + + // If there are differences, apply them too + if (properties.differences && properties.differences.length) { + for (let i = 0; i < 256; i++) { + if (properties.differences[i]) { + const glyphName = properties.differences[i]; + if (typeof glyphName === 'string' && GlyphsUnicode[glyphName]) { + toUnicode[i] = String.fromCharCode(GlyphsUnicode[glyphName]); + } + } + } + } + this.toUnicode = toUnicode; + } + return; } - // Trying to fix encoding using glyph CIDSystemInfo. this.loadCidToUnicode(properties); this.cidEncoding = properties.cidEncoding; @@ -4494,7 +4523,43 @@ var Font = (function FontClosure() { case 'Type3': var glyphName = this.differences[charcode] || this.encoding[charcode]; operatorList = this.charProcOperatorList[glyphName]; - fontCharCode = charcode; + + // For text extraction, map the glyph name to Unicode if possible + if (glyphName) { + fontCharCode = GlyphsUnicode[glyphName] || charcode; + + // Handle common symbolic glyphs + if (fontCharCode === charcode && typeof glyphName === 'string') { + // Special handling for specific glyphs + if (glyphName.startsWith('uni')) { + // Handle uniXXXX format + const hex = glyphName.substring(3); + if (/^[0-9A-F]{4,6}$/i.test(hex)) { + fontCharCode = parseInt(hex, 16); + } + } + + // Check if it's a common symbol + const commonSymbols = { + 'bullet': 0x2022, + 'checkbox': 0x2610, + 'checkmark': 0x2713, + 'circle': 0x25CB, + 'square': 0x25A1, + 'triangle': 0x25B2, + 'triangledown': 0x25BC, + 'triangleleft': 0x25C0, + 'triangleright': 0x25B6, + 'star': 0x2605 + }; + + if (commonSymbols[glyphName.toLowerCase()]) { + fontCharCode = commonSymbols[glyphName.toLowerCase()]; + } + } + } else { + fontCharCode = charcode; + } break; case 'TrueType': if (this.useToFontChar) { diff --git a/base/display/canvas.js b/base/display/canvas.js index 43b2e42..b48cae6 100755 --- a/base/display/canvas.js +++ b/base/display/canvas.js @@ -903,8 +903,10 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { this.current.fontSize = size; if (fontObj.coded) { - warn('Unsupported Type3 font (custom Glyph) - ' + fontRefName); - return; // we don't need ctx.font for Type3 fonts + warn('Found Type3 font (custom Glyph) - ' + fontRefName + ', trying to decode'); // MQZ 8/23 added Type3 glyph font support + // MQZ. 08/24/2025 need to set up the font context for glyph based text processing + this.ctx.setFont(fontObj); + return; // we don't need ctx.font for Type3 fonts } var name = fontObj.loadedName || 'sans-serif'; @@ -1053,13 +1055,36 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var glyphsLength = glyphs.length; var textLayer = this.textLayer; var geom; - var textSelection = textLayer && !skipTextSelection ? true : false; + + // Always use textSelection for Type3 fonts + var textSelection = textLayer && (font.coded || !skipTextSelection) ? true : false; + var type3Text = ""; + var canvasWidth = 0.0; var vertical = font.vertical; var defaultVMetrics = font.defaultVMetrics; + info(`showText called with ${glyphsLength} glyphs, font type: ${font.coded ? 'Type3' : font.type || 'Unknown'}, textSelection: ${textSelection}`); + // Type3 fonts - each glyph is a "mini-PDF" if (font.coded) { + info(`Processing Type3 font with ${glyphsLength} glyphs`); + + // For Type3 fonts, collect unicode characters or character codes + for (var i = 0; i < glyphsLength; ++i) { + var glyph = glyphs[i]; + if (glyph !== null) { + // Use unicode value if available, otherwise use fontChar + if (glyph.unicode) { + type3Text += glyph.unicode; + } else if (glyph.fontChar) { + type3Text += String.fromCharCode(glyph.fontChar); + } + } + } + info(`Type3 text: ${type3Text}`); + + // If we have collected text, store it for later use in appendText ctx.save(); ctx.transform.apply(ctx, current.textMatrix); ctx.translate(current.x, current.y); @@ -1070,18 +1095,22 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { this.save(); ctx.scale(1, -1); geom = this.createTextGeometry(); + // Add the Type3 text to the geometry object so it can be added to the output + geom.type3Text = type3Text; + geom.fontSize = fontSize; this.restore(); } for (var i = 0; i < glyphsLength; ++i) { - var glyph = glyphs[i]; if (glyph === null) { // word break + info(`Type3 word break at glyph ${i}`); this.ctx.translate(wordSpacing, 0); current.x += wordSpacing * textHScale; continue; } + //info(`Processing Type3 glyph ${i}: ${glyph.unicode || glyph.fontChar}`); this.processingType3 = glyph; this.save(); ctx.scale(fontSize, fontSize); @@ -1093,24 +1122,46 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var width = (transformed[0] * fontSize + charSpacing) * current.fontDirection; + //info(`Type3 glyph width: ${width}`); ctx.translate(width, 0); current.x += width * textHScale; canvasWidth += width; } + // Render Type3 text within the transformation context + if (type3Text) { + info(`render Type3 text: '${type3Text}', disableFontFace: ${font.disableFontFace}`); + var curFontSize = fontSize; + switch (current.textRenderingMode) { + case TextRenderingMode.FILL: + ctx.fillText(type3Text, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.STROKE: + ctx.strokeText(type3Text, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.FILL_STROKE: + ctx.fillText(type3Text, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.INVISIBLE: + case TextRenderingMode.ADD_TO_PATH: + break; + default: // other unsupported rendering modes + } + } + ctx.restore(); this.processingType3 = null; } else { ctx.save(); -//MQZ Dec.04.2013 handles leading word spacing - var tx = 0; - if (wordSpacing !== 0) { - var firstGlyph = glyphs.filter(g => g && ('fontChar' in g || 'unicode' in g))[0]; - if (firstGlyph && (firstGlyph.fontChar === ' ' || firstGlyph.unicode === ' ')) { + //MQZ Dec.04.2013 handles leading word spacing + var tx = 0; + if (wordSpacing !== 0) { + var firstGlyph = glyphs.filter(g => g && ('fontChar' in g || 'unicode' in g))[0]; + if (firstGlyph && (firstGlyph.fontChar === ' ' || firstGlyph.unicode === ' ')) { tx = wordSpacing * fontSize * textHScale; - } - } + } + } current.x += tx this.applyTextTransforms(); @@ -1135,8 +1186,8 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { ctx.lineWidth = lineWidth; -//MQZ. Feb.20.2013. Disable character based painting, make it a string - var str = ""; + //MQZ. Feb.20.2013. Disable character based painting, make it a string + var str = ""; var x = 0; for (var i = 0; i < glyphsLength; ++i) { @@ -1188,7 +1239,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { //MQZ. Feb.20.2013. Disable character based painting, make it a string // this.paintChar(character, scaledX, scaledY); - str += glyph.unicode || character; + str += glyph.unicode || character; if (accent) { scaledAccentX = scaledX + accent.offset.x / fontSizeScale; scaledAccentY = scaledY - accent.offset.y / fontSizeScale; @@ -1218,35 +1269,28 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { // info(nodeUtil.inspect(glyphs)); // } - if (str && !font.disableFontFace) { - var curFontSize = fontSize * scale * textHScale + 3; - switch (current.textRenderingMode) { - case TextRenderingMode.FILL: - ctx.fillText(str, 0, 0, canvasWidth, curFontSize); - break; - case TextRenderingMode.STROKE: - ctx.strokeText(str, 0, 0, canvasWidth, curFontSize); - break; - case TextRenderingMode.FILL_STROKE: - ctx.fillText(str, 0, 0, canvasWidth, curFontSize); - break; - case TextRenderingMode.INVISIBLE: - case TextRenderingMode.ADD_TO_PATH: - break; - default: // other unsupported rendering modes - } - } ctx.restore(); } - if (textSelection) { - geom.canvasWidth = canvasWidth; - if (vertical) { - var VERTICAL_TEXT_ROTATION = Math.PI / 2; - geom.angle += VERTICAL_TEXT_ROTATION; + // Text rendering for regular fonts (Type3 fonts are handled in their own context above) + if (str && !font.disableFontFace && !font.coded) { + var curFontSize = fontSize * scale * textHScale + 3; + switch (current.textRenderingMode) { + case TextRenderingMode.FILL: + ctx.fillText(str, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.STROKE: + ctx.strokeText(str, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.FILL_STROKE: + ctx.fillText(str, 0, 0, canvasWidth, curFontSize); + break; + case TextRenderingMode.INVISIBLE: + case TextRenderingMode.ADD_TO_PATH: + break; + default: // other unsupported rendering modes } - this.textLayer.appendText(geom); } return canvasWidth; @@ -1334,7 +1378,6 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var VERTICAL_TEXT_ROTATION = Math.PI / 2; geom.angle += VERTICAL_TEXT_ROTATION; } - this.textLayer.appendText(geom); } }, nextLineShowText: function CanvasGraphics_nextLineShowText(text) { diff --git a/lib/pdffont.js b/lib/pdffont.js index edf07e5..ce4b163 100644 --- a/lib/pdffont.js +++ b/lib/pdffont.js @@ -205,7 +205,7 @@ export default class PDFFont { if (this.typeName.indexOf('narrow') > 0) this.faceIdx = 1; } - // nodeUtil.p2jinfo"typeName = " + typeName + " => faceIdx = " + this.faceIdx); + nodeUtil.p2jinfo(`typeName = ${typeName} => faceIdx = ${this.faceIdx}`); } #getFontStyleIndex(fontSize) { @@ -289,14 +289,184 @@ export default class PDFFont { retVal = '\u2022'; break; //Bullet dot default: - nodeUtil.p2jinfo( - `${this.fontObj.type} - SymbolicFont - (${this.fontObj.name}) : ${str.charCodeAt(0)}::${str.charCodeAt(1)} => ${retVal}` - ); + nodeUtil.p2jinfo(`${this.fontObj.type} - SymbolicFont - (${this.fontObj.name}) : ${str.charCodeAt(0)}::${str.charCodeAt(1)} => ${retVal}`); } return retVal; } + #processType3Font(str) { + // Special handling for Type3 fonts + if (!str || str.length !== 1 || this.fontObj.type !== 'Type3') { + return str; + } + + // Debug info + nodeUtil.p2jinfo(`Processing Type3 font: char code = ${str.charCodeAt(0)}, char = '${str}'`); + if (this.fontObj.charProcMapping) { + nodeUtil.p2jinfo(`charProcMapping available with ${Object.keys(this.fontObj.charProcMapping).length} entries`); + } else { + nodeUtil.p2jinfo(`No charProcMapping available for this Type3 font`); + + // If no mapping is available, try to use the character directly + if (str && str.length === 1) { + const code = str.charCodeAt(0); + if (code >= 65 && code <= 90) { // A-Z + nodeUtil.p2jinfo(`Using direct uppercase letter: ${str}`); + return str; + } else if (code >= 97 && code <= 122) { // a-z + nodeUtil.p2jinfo(`Using direct lowercase letter: ${str}`); + return str; + } else if (code >= 48 && code <= 57) { // 0-9 + nodeUtil.p2jinfo(`Using direct digit: ${str}`); + return str; + } + } + } + + // Use the charProcMapping if available to map character code to glyph name + if (this.fontObj.charProcMapping) { + const charCode = str.charCodeAt(0); + const glyphName = this.fontObj.charProcMapping[charCode]; + + if (glyphName) { + nodeUtil.p2jinfo(`Found glyph name in mapping: ${glyphName}`); + // Map common Type3 glyph names to Unicode characters + const glyphToUnicode = { + 'bullet': '\u2022', + 'checkbox': '\u2610', + 'checkmark': '\u2713', + 'check': '\u2713', + 'circle': '\u25CB', + 'square': '\u25A1', + 'triangle': '\u25B2', + 'triangledown': '\u25BC', + 'triangleleft': '\u25C0', + 'triangleright': '\u25B6', + 'star': '\u2605', + 'diamond': '\u25C6', + 'heart': '\u2665', + 'club': '\u2663', + 'spade': '\u2660', + 'filledcircle': '\u25CF', + 'filledsquare': '\u25A0', + 'filledtriangle': '\u25B2', + 'filledtriangledown': '\u25BC', + 'filledtriangleright': '\u25B6', + 'filledtriangleleft': '\u25C0', + 'arrowleft': '\u2190', + 'arrowright': '\u2192', + 'arrowup': '\u2191', + 'arrowdown': '\u2193', + 'cross': '\u2717' + }; + + // Check for direct match + const glyphNameLower = typeof glyphName === 'string' ? glyphName.toLowerCase() : ''; + if (glyphNameLower in glyphToUnicode) { + const unicodeChar = glyphToUnicode[/** @type {keyof typeof glyphToUnicode} */ (glyphNameLower)]; + nodeUtil.p2jinfo(`Mapped ${glyphNameLower} to Unicode ${unicodeChar}`); + return unicodeChar; + } + + // Check for letters in the glyph name (g0, g1, etc.) + if (typeof glyphName === 'string' && glyphName.length > 1) { + const letterMatch = glyphName.match(/[A-Za-z]/g); + if (letterMatch && letterMatch.length === 1) { + const letter = letterMatch[0].toUpperCase(); + nodeUtil.p2jinfo(`Extracted letter ${letter} from glyph name ${glyphName}`); + return letter; + } + } + + // Check for partial match (glyph name contains known keyword) + for (const key in glyphToUnicode) { + if (glyphNameLower.indexOf(key) >= 0) { + const unicodeChar = glyphToUnicode[/** @type {keyof typeof glyphToUnicode} */ (key)]; + nodeUtil.p2jinfo(`Partial match: ${glyphNameLower} contains ${key}, mapped to ${unicodeChar}`); + return unicodeChar; + } + } + + // Try to match letters in the glyph name (e.g. g26 -> "C", g28 -> "O", etc.) + // Look for letter patterns in the glyph name + if (typeof glyphName === 'string') { + // Try to extract letter from glyph name + const letterMatch = glyphName.match(/[A-Za-z]/g); + if (letterMatch && letterMatch.length === 1) { + const letter = letterMatch[0].toUpperCase(); + nodeUtil.p2jinfo(`Extracted letter ${letter} from glyph name ${glyphName}`); + return letter; + } + + // Handle number in glyph name to suggest possible letter + const numberMatch = glyphName.match(/\d+/); + if (numberMatch && numberMatch.length === 1) { + const num = parseInt(numberMatch[0], 10); + // Map numbers to alphabet (1=A, 2=B, etc.) + if (num >= 1 && num <= 26) { + const letter = String.fromCharCode(64 + num); // ASCII 'A' is 65 + nodeUtil.p2jinfo(`Mapped number ${num} in glyph name ${glyphName} to letter ${letter}`); + return letter; + } + } + } + + // Handle uniXXXX format glyph names + if (typeof glyphName === 'string' && glyphName.startsWith('uni')) { + const hex = glyphName.substring(3); + if (/^[0-9A-F]{4,6}$/i.test(hex)) { + nodeUtil.p2jinfo(`Mapped uni${hex} to Unicode character`); + return String.fromCharCode(parseInt(hex, 16)); + } + } + } + } + + // If we reach here, try direct character code mapping + const charCode = str.charCodeAt(0); + + // No hard-coded directMappings, rely on charProcMapping from the font object + nodeUtil.p2jinfo(`No direct mapping for character code ${charCode}, checking general mappings`); + + + // Direct mapping for common Type3 glyph character codes + let result = str; + switch (charCode) { + case 18: result = '\u2713'; break; // Check mark + case 19: result = '\u2610'; break; // Ballot box + case 20: result = '\u2611'; break; // Ballot box with check + case 108: result = '\u2022'; break; // Bullet + case 109: result = '\u25CF'; break; // Black circle + case 110: result = '\u25CB'; break; // White circle + case 111: result = '\u25A0'; break; // Black square + case 112: result = '\u25A1'; break; // White square + case 113: result = '\u25B2'; break; // Black up-pointing triangle + case 114: result = '\u25BC'; break; // Black down-pointing triangle + case 117: result = '\u2190'; break; // Left arrow + case 118: result = '\u2192'; break; // Right arrow + case 119: result = '\u2191'; break; // Up arrow + case 120: result = '\u2193'; break; // Down arrow + case 128: result = '\u221E'; break; // Infinity + case 129: result = '\u2260'; break; // Not equal + case 130: result = '\u2264'; break; // Less than or equal + case 131: result = '\u2265'; break; // Greater than or equal + } + + if (result !== str) { + nodeUtil.p2jinfo(`Mapped char code ${charCode} to Unicode ${result}`); + } else { + nodeUtil.p2jinfo(`No mapping found for char code ${charCode}, returning original character`); + } + + return result; + } + + /** + * Calculate the rotation angle from a 2D transformation matrix + * @param {number[][]} matrix2D - The 2D transformation matrix + * @returns {number} - The rotation angle in degrees + */ #textRotationAngle(matrix2D) { let retVal = 0; if (matrix2D[0][0] === 0 && matrix2D[1][1] === 0) { @@ -314,11 +484,35 @@ export default class PDFFont { } // public instance methods + /** + * Process text for rendering + * @param {{x: number, y: number}} p - The position + * @param {string} str - The text string + * @param {number} maxWidth - Maximum width + * @param {string} color - Color value + * @param {number} fontSize - Font size + * @param {{Texts: Array}} targetData - Target data object + * @param {number[][]} matrix2D - 2D transformation matrix + */ processText(p, str, maxWidth, color, fontSize, targetData, matrix2D) { - const text = this.#processSymbolicFont(str); + // Debug the incoming text processing + nodeUtil.p2jinfo(`Processing text: '${str}', font type: ${this.fontObj.type || 'unknown'}, char code: ${str ? str.charCodeAt(0) : 'none'}`); + + // Save original text for fallback + const originalStr = str; + + // First try to process Type3 fonts, then fall back to symbolic fonts + let text = this.fontObj.type === 'Type3' ? + this.#processType3Font(str) : + this.#processSymbolicFont(str); + if (!text) { - return; + nodeUtil.p2jinfo('Text processing returned null or empty, falling back to original text'); + text = originalStr; // Use original text as fallback } + + nodeUtil.p2jinfo(`Processed text: '${str}' -> '${text}'`); + this.fontStyleId = this.#getFontStyleIndex(fontSize); // when this.fontStyleId === -1, it means the text style doesn't match any entry in the dictionary @@ -344,7 +538,8 @@ export default class PDFFont { const rAngle = this.#textRotationAngle(matrix2D); if (rAngle !== 0) { nodeUtil.p2jinfo(`${str}: rotated ${rAngle} degree.`); - textRun = { ...textRun, RA: rAngle }; + // Add RA property safely + textRun = Object.assign({}, textRun, { RA: rAngle }); } const oneText = { @@ -355,11 +550,18 @@ export default class PDFFont { sw: this.spaceWidth, //font space width, use to merge adjacent text blocks A: 'left', R: [textRun], + // TT: this.fontObj.isSymbolicFont || this.fontObj.type === 'Type3' ? 1 : 0, // Add TT flag for symbolic and Type3 fonts }; + nodeUtil.p2jinfo(`Adding text to output: '${text}'`); targetData.Texts.push(oneText); } + /** + * Encode text for output + * @param {string} str - The string to encode + * @returns {string} - The encoded string + */ flashEncode(str) { let retVal = encodeURIComponent(str); retVal = retVal.replace('%C2%96', '-'); diff --git a/package.json b/package.json index 1cb28f4..65b112f 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "test:jest": "jest --config ./jest.config.json --detectOpenHandles", "test": "npm run test:jest && npm run parse-r && npm run parse-fd && npm run test:deno && npm run test:bun", "test:forms": "cd ./test && sh p2j.forms.sh", - "test:misc": "cd ./test && sh p2j.one.sh misc . \"Expected: 14 success, 6 fail exception with stack trace\" ", + "test:misc": "cd ./test && sh p2j.one.sh misc . \"Expected: 15 success, 6 fail exception with stack trace\" ", "parse": "./bin/pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form", "parse-s": "./bin/pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s", "parse-t": "./bin/pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s -t", diff --git a/test/_test_type3glyph.cjs b/test/_test_type3glyph.cjs new file mode 100644 index 0000000..70b38ae --- /dev/null +++ b/test/_test_type3glyph.cjs @@ -0,0 +1,172 @@ +const fs = require('fs'); +const path = require('path'); +const PDFParser = require("../dist/pdfparser.cjs"); + +describe('Type3 Glyph Font Tests', () => { + let pdfParser; + const testPdfPath = path.join(__dirname, 'pdf/misc/i389_type3_glyph.pdf'); + const outputDir = path.join(__dirname, 'target/misc'); + const jsonOutputPath = path.join(outputDir, 'i389_type3_glyph.json'); + const contentOutputPath = path.join(outputDir, 'i389_type3_glyph.content.txt'); + + beforeEach(() => { + pdfParser = new PDFParser(null, 1); + }); + + afterEach(() => { + if (pdfParser) { + pdfParser.destroy(); + } + }); + + test('should successfully parse Type3 glyph font PDF', async () => { + return new Promise((resolve, reject) => { + // Set up event handlers + pdfParser.on('pdfParser_dataError', (errData) => { + reject(new Error(`PDF parsing failed: ${errData.parserError}`)); + }); + + pdfParser.on('pdfParser_dataReady', (pdfData) => { + try { + // Basic structure assertions + expect(pdfData).toBeDefined(); + expect(pdfData.Pages).toBeDefined(); + expect(pdfData.Pages.length).toBe(1); + + const page = pdfData.Pages[0]; + expect(page.Texts).toBeDefined(); + expect(page.Texts.length).toBe(2); // Should have both Type3 and regular text + + // Check for Type3 text "CONTENT" + const type3Text = page.Texts.find(text => + text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT' + ); + expect(type3Text).toBeDefined(); + expect(type3Text.R[0].T).toBe('CONTENT'); + + // Check for regular text "Added Text from Acrobat" + const regularText = page.Texts.find(text => + text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'Added Text from Acrobat' + ); + expect(regularText).toBeDefined(); + expect(decodeURIComponent(regularText.R[0].T)).toBe('Added Text from Acrobat'); + + console.log('✓ Type3 glyph font parsing successful'); + console.log(`✓ Found Type3 text: "${decodeURIComponent(type3Text.R[0].T)}"`); + console.log(`✓ Found regular text: "${decodeURIComponent(regularText.R[0].T)}"`); + + resolve(); + } catch (error) { + reject(error); + } + }); + + // Load and parse the PDF + pdfParser.loadPDF(testPdfPath); + }); + }, 30000); // 30 second timeout + + test('should generate correct output files with both texts', async () => { + // Ensure output directory exists + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + return new Promise((resolve, reject) => { + pdfParser.on('pdfParser_dataError', (errData) => { + reject(new Error(`PDF parsing failed: ${errData.parserError}`)); + }); + + pdfParser.on('pdfParser_dataReady', (pdfData) => { + try { + // Write JSON output + const jsonOutput = JSON.stringify(pdfData, null, 2); + fs.writeFileSync(jsonOutputPath, jsonOutput); + + // Write content output + let contentOutput = ''; + pdfData.Pages.forEach((page, pageIndex) => { + page.Texts.forEach(text => { + if (text.R) { + text.R.forEach(run => { + contentOutput += decodeURIComponent(run.T) + '\n'; + }); + } + }); + contentOutput += `----------------Page (${pageIndex}) Break----------------\n`; + }); + fs.writeFileSync(contentOutputPath, contentOutput); + + // Verify JSON file exists and contains both texts + expect(fs.existsSync(jsonOutputPath)).toBe(true); + const jsonContent = fs.readFileSync(jsonOutputPath, 'utf8'); + const parsedJson = JSON.parse(jsonContent); + + expect(parsedJson.Pages[0].Texts.length).toBe(2); + expect(jsonContent).toContain('CONTENT'); + expect(jsonContent).toContain('Added%20Text%20from%20Acrobat'); + + // Verify content file exists and contains both texts + expect(fs.existsSync(contentOutputPath)).toBe(true); + const contentFileContent = fs.readFileSync(contentOutputPath, 'utf8'); + expect(contentFileContent).toContain('CONTENT'); + expect(contentFileContent).toContain('Added Text from Acrobat'); + + console.log('✓ JSON output file created successfully'); + console.log('✓ Content output file created successfully'); + console.log('✓ Both files contain expected Type3 and regular text'); + + resolve(); + } catch (error) { + reject(error); + } + }); + + // Load and parse the PDF + pdfParser.loadPDF(testPdfPath); + }); + }, 30000); // 30 second timeout + + test('should handle Type3 font metadata correctly', async () => { + return new Promise((resolve, reject) => { + pdfParser.on('pdfParser_dataError', (errData) => { + reject(new Error(`PDF parsing failed: ${errData.parserError}`)); + }); + + pdfParser.on('pdfParser_dataReady', (pdfData) => { + try { + const page = pdfData.Pages[0]; + + // Find Type3 text + const type3Text = page.Texts.find(text => + text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT' + ); + + // Verify Type3 text has proper positioning + expect(type3Text.x).toBeDefined(); + expect(type3Text.y).toBeDefined(); + expect(typeof type3Text.x).toBe('number'); + expect(typeof type3Text.y).toBe('number'); + + // Verify text run structure + expect(type3Text.R).toBeDefined(); + expect(type3Text.R.length).toBe(1); + expect(type3Text.R[0].T).toBe('CONTENT'); + expect(type3Text.R[0].S).toBeDefined(); // Style index + expect(type3Text.R[0].TS).toBeDefined(); // Text style array + + console.log('✓ Type3 font metadata validation successful'); + console.log(`✓ Type3 text position: (${type3Text.x}, ${type3Text.y})`); + console.log(`✓ Type3 text style: S=${type3Text.R[0].S}, TS=[${type3Text.R[0].TS.join(',')}]`); + + resolve(); + } catch (error) { + reject(error); + } + }); + + // Load and parse the PDF + pdfParser.loadPDF(testPdfPath); + }); + }, 30000); // 30 second timeout +}); diff --git a/test/pdf/misc/i389_type3_glyph.pdf b/test/pdf/misc/i389_type3_glyph.pdf new file mode 100644 index 0000000..c73d882 Binary files /dev/null and b/test/pdf/misc/i389_type3_glyph.pdf differ