Skip to content

Commit 7b05aa9

Browse files
authored
fix: #385 [3.3.0 BREAKING CHANGE] removed encodeURIComponent and ensure utf8 extraction and output (#410)
Complete fix for issue #385 - encoding and partial character issues.

## BREAKING CHANGE

Text in JSON output is no longer URI-encoded.

Fixed two issues:

1. Removed URI encoding - Chinese/CJK characters now display as UTF-8.
2. Fixed missing characters - glyphs marked as disabled are now extracted correctly.

* Removed encodeURIComponent() from flashEncode()
* Moved text extraction outside the glyph.disabled check in canvas.js
* Added explicit UTF-8 encoding to all file operations
* Updated tests to work with direct text access
* Bumped version to 3.3.0
1 parent 5569bf7 commit 7b05aa9

File tree

7 files changed

+44
-34
lines changed

7 files changed

+44
-34
lines changed

base/display/canvas.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,15 +1239,19 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
12391239

12401240
//MQZ. Feb.20.2013. Disable character based painting, make it a string
12411241
// this.paintChar(character, scaledX, scaledY);
1242-
str += glyph.unicode || character;
12431242
if (accent) {
12441243
scaledAccentX = scaledX + accent.offset.x / fontSizeScale;
12451244
scaledAccentY = scaledY - accent.offset.y / fontSizeScale;
12461245
//MQZ. Feb.20.2013. Disable character based painting, make it a string
12471246
// this.paintChar(accent.fontChar, scaledAccentX, scaledAccentY);
1248-
// str += accent.fontChar;
12491247
}
12501248
}
1249+
1250+
// Always extract text for pdf2json, even if glyph is disabled for rendering (fixes issue #385)
1251+
str += glyph.unicode || character;
1252+
if (accent) {
1253+
// str += accent.fontChar; // Accent characters handled above
1254+
}
12511255

12521256
x += charWidth;
12531257

lib/parserstream.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ export class ParserStream extends Transform {
1111
}
1212

1313
static createOutputStream(outputPath, resolve, reject) {
14-
const outputStream = fs.createWriteStream(outputPath);
14+
const outputStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
1515
outputStream.on('finish', () => resolve(outputPath));
1616
outputStream.on('error', err => reject(err) );
1717
return outputStream;
@@ -71,7 +71,7 @@ export class StringifyStream extends Transform {
7171
}
7272

7373
_transform(obj, encoding, callback){
74-
this.push(JSON.stringify(obj));
74+
this.push(JSON.stringify(obj), 'utf8');
7575
callback();
7676
}
7777
}

lib/pdf.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ export default class PDFJSClass extends EventEmitter {
366366
const isDup = j > 0 && PDFFont.areDuplicateBlocks(page.Texts[j - 1], t);
367367
if (isDup) {
368368
PJS.info(
369-
`skipped: dup text block: ${decodeURIComponent(t.R[0].T)}`
369+
`skipped: dup text block: ${t.R[0].T}`
370370
);
371371
}
372372
return !isDup;
@@ -380,14 +380,14 @@ export default class PDFJSClass extends EventEmitter {
380380
PDFFont.areAdjacentBlocks(prevText, text) &&
381381
PDFFont.haveSameStyle(prevText, text)
382382
) {
383-
const preT = decodeURIComponent(prevText.R[0].T);
384-
const curT = decodeURIComponent(text.R[0].T);
383+
const preT = prevText.R[0].T;
384+
const curT = text.R[0].T;
385385

386386
prevText.R[0].T += text.R[0].T;
387387
prevText.w += text.w;
388388
text.merged = true;
389389

390-
const mergedText = decodeURIComponent(prevText.R[0].T);
390+
const mergedText = prevText.R[0].T;
391391
PJS.info(
392392
`merged text block: ${preT} + ${curT} => ${mergedText}`
393393
);

lib/pdffont.js

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -558,22 +558,29 @@ export default class PDFFont {
558558
}
559559

560560
/**
561-
* Encode text for output
561+
* Encode text for output - preserves UTF-8 multi-byte characters
562+
* NOTE: Breaking change in v3.3.0 - removed URI encoding to fix issue #385
563+
* Chinese/Japanese/Korean and other multi-byte characters now output as UTF-8
562564
* @param {string} str - The string to encode
563-
* @returns {string} - The encoded string
565+
* @returns {string} - The encoded string with legacy character replacements
564566
*/
565567
flashEncode(str) {
566-
let retVal = encodeURIComponent(str);
567-
retVal = retVal.replace('%C2%96', '-');
568-
retVal = retVal.replace('%C2%91', '%27');
569-
retVal = retVal.replace('%C2%92', '%27');
570-
retVal = retVal.replace('%C2%82', '%27');
571-
retVal = retVal.replace('%C2%93', '%22');
572-
retVal = retVal.replace('%C2%94', '%22');
573-
retVal = retVal.replace('%C2%84', '%22');
574-
retVal = retVal.replace('%C2%8B', '%C2%AB');
575-
retVal = retVal.replace('%C2%9B', '%C2%BB');
576-
568+
if (!str) return str;
569+
570+
let retVal = str;
571+
572+
// Apply legacy Flash-specific character replacements
573+
// These handle problematic characters from old PDF encodings
574+
retVal = retVal.replace(/\u0096/g, '-'); // En dash
575+
retVal = retVal.replace(/\u0091/g, "'"); // Left single quote
576+
retVal = retVal.replace(/\u0092/g, "'"); // Right single quote
577+
retVal = retVal.replace(/\u0082/g, "'"); // Low single quote
578+
retVal = retVal.replace(/\u0093/g, '"'); // Left double quote
579+
retVal = retVal.replace(/\u0094/g, '"'); // Right double quote
580+
retVal = retVal.replace(/\u0084/g, '"'); // Low double quote
581+
retVal = retVal.replace(/\u008B/g, '«'); // Left guillemet
582+
retVal = retVal.replace(/\u009B/g, '»'); // Right guillemet
583+
577584
return retVal;
578585
}
579586

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "pdf2json",
3-
"version": "3.2.3",
3+
"version": "3.3.0",
44
"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
55
"keywords": [
66
"pdf",

src/cli/p2jcli.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ class PDFProcessor {
137137
);
138138
}
139139

140-
const outputStream = fs.createWriteStream(this.outputPath);
140+
const outputStream = fs.createWriteStream(this.outputPath, { encoding: 'utf8' });
141141
outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));
142142
outputStream.on("error", (err) => this.onPrimaryError(err, reject));
143143

@@ -163,7 +163,7 @@ class PDFProcessor {
163163
}
164164

165165
this.pdfParser.on("pdfParser_dataReady", (evtData: PDFParserData) => {
166-
fs.writeFile(this.outputPath, JSON.stringify(evtData), (err) => {
166+
fs.writeFile(this.outputPath, JSON.stringify(evtData), 'utf8', (err) => {
167167
if (err) {
168168
this.onPrimaryError(err, reject);
169169
} else {

test/_test_type3glyph.cjs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,28 +32,27 @@ describe('Type3 Glyph Font Tests', () => {
3232
expect(pdfData).toBeDefined();
3333
expect(pdfData.Pages).toBeDefined();
3434
expect(pdfData.Pages.length).toBe(1);
35-
3635
const page = pdfData.Pages[0];
3736
expect(page.Texts).toBeDefined();
3837
expect(page.Texts.length).toBe(2); // Should have both Type3 and regular text
3938

4039
// Check for Type3 text "CONTENT"
4140
const type3Text = page.Texts.find(text =>
42-
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
41+
text.R && text.R[0] && text.R[0].T === 'CONTENT'
4342
);
4443
expect(type3Text).toBeDefined();
45-
expect(type3Text.R[0].T).toBe('CONTENT');
44+
expect((type3Text.R[0].T)).toBe('CONTENT');
4645

4746
// Check for regular text "Added Text from Acrobat"
4847
const regularText = page.Texts.find(text =>
49-
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'Added Text from Acrobat'
48+
text.R && text.R[0] && text.R[0].T === 'Added Text from Acrobat'
5049
);
5150
expect(regularText).toBeDefined();
52-
expect(decodeURIComponent(regularText.R[0].T)).toBe('Added Text from Acrobat');
51+
expect(regularText.R[0].T).toBe('Added Text from Acrobat');
5352

5453
console.log('✓ Type3 glyph font parsing successful');
55-
console.log(`✓ Found Type3 text: "${decodeURIComponent(type3Text.R[0].T)}"`);
56-
console.log(`✓ Found regular text: "${decodeURIComponent(regularText.R[0].T)}"`);
54+
console.log(`✓ Found Type3 text: "${type3Text.R[0].T}"`);
55+
console.log(`✓ Found regular text: "${regularText.R[0].T}"`);
5756

5857
resolve();
5958
} catch (error) {
@@ -89,7 +88,7 @@ describe('Type3 Glyph Font Tests', () => {
8988
page.Texts.forEach(text => {
9089
if (text.R) {
9190
text.R.forEach(run => {
92-
contentOutput += decodeURIComponent(run.T) + '\n';
91+
contentOutput += run.T + '\n';
9392
});
9493
}
9594
});
@@ -104,7 +103,7 @@ describe('Type3 Glyph Font Tests', () => {
104103

105104
expect(parsedJson.Pages[0].Texts.length).toBe(2);
106105
expect(jsonContent).toContain('CONTENT');
107-
expect(jsonContent).toContain('Added%20Text%20from%20Acrobat');
106+
expect(jsonContent).toContain('Added Text from Acrobat');
108107

109108
// Verify content file exists and contains both texts
110109
expect(fs.existsSync(contentOutputPath)).toBe(true);
@@ -139,7 +138,7 @@ describe('Type3 Glyph Font Tests', () => {
139138

140139
// Find Type3 text
141140
const type3Text = page.Texts.find(text =>
142-
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
141+
text.R && text.R[0] && text.R[0].T === 'CONTENT'
143142
);
144143

145144
// Verify Type3 text has proper positioning

0 commit comments

Comments
 (0)