fix: #385 [3.3.0 BREAKING CHANGE] remove encodeURIComponent and ensure UTF-8 extraction and output (#410)

modesty · web-flow · commit 7b05aa968cee · 2025-10-04T20:09:54.000-07:00
Complete fix for issue #385 - encoding and partial character issues

## BREAKING CHANGE: Text in JSON output is no longer URI-encoded.

Fixed two issues:

1. Removed URI encoding - Chinese/CJK characters now display as UTF-8
2. Fixed missing characters - glyphs marked as disabled are now extracted correctly

- Removed encodeURIComponent() from flashEncode()
- Moved text extraction outside glyph.disabled check in canvas.js
- Added explicit UTF-8 encoding to all file operations
- Updated tests to work with direct text access
- Bumped version to 3.3.0
diff --git a/base/display/canvas.js b/base/display/canvas.js
@@ -1239,15 +1239,19 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
 
             //MQZ. Feb.20.2013. Disable character based painting, make it a string
 //            this.paintChar(character, scaledX, scaledY);
-            str += glyph.unicode || character;
             if (accent) {
               scaledAccentX = scaledX + accent.offset.x / fontSizeScale;
               scaledAccentY = scaledY - accent.offset.y / fontSizeScale;
                 //MQZ. Feb.20.2013. Disable character based painting, make it a string
 //              this.paintChar(accent.fontChar, scaledAccentX, scaledAccentY);
-//                str += accent.fontChar;
             }
           }
+          
+          // Always extract text for pdf2json, even if glyph is disabled for rendering (fixes issue #385)
+          str += glyph.unicode || character;
+          if (accent) {
+            // str += accent.fontChar;  // Accent characters handled above
+          }
 
           x += charWidth;
 
diff --git a/lib/parserstream.js b/lib/parserstream.js
@@ -11,7 +11,7 @@ export class ParserStream extends Transform {
 	}
 
     static createOutputStream(outputPath, resolve, reject) {
-		const outputStream = fs.createWriteStream(outputPath);
+		const outputStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
 		outputStream.on('finish', () => resolve(outputPath));
 		outputStream.on('error', err => reject(err) );
 		return outputStream;
@@ -71,7 +71,7 @@ export class StringifyStream extends Transform {
     }
 
     _transform(obj, encoding, callback){
-        this.push(JSON.stringify(obj));
+        this.push(JSON.stringify(obj), 'utf8');
         callback();
     }
 }
diff --git a/lib/pdf.js b/lib/pdf.js
@@ -366,7 +366,7 @@ export default class PDFJSClass extends EventEmitter {
         const isDup = j > 0 && PDFFont.areDuplicateBlocks(page.Texts[j - 1], t);
         if (isDup) {
           PJS.info(
-            `skipped: dup text block: ${decodeURIComponent(t.R[0].T)}`
+            `skipped: dup text block: ${t.R[0].T}`
           );
         }
         return !isDup;
@@ -380,14 +380,14 @@ export default class PDFJSClass extends EventEmitter {
             PDFFont.areAdjacentBlocks(prevText, text) &&
             PDFFont.haveSameStyle(prevText, text)
           ) {
-            const preT = decodeURIComponent(prevText.R[0].T);
-            const curT = decodeURIComponent(text.R[0].T);
+            const preT = prevText.R[0].T;
+            const curT = text.R[0].T;
 
             prevText.R[0].T += text.R[0].T;
             prevText.w += text.w;
             text.merged = true;
 
-            const mergedText = decodeURIComponent(prevText.R[0].T);
+            const mergedText = prevText.R[0].T;
             PJS.info(
               `merged text block: ${preT} + ${curT} => ${mergedText}`
             );
diff --git a/lib/pdffont.js b/lib/pdffont.js
@@ -558,22 +558,29 @@ export default class PDFFont {
    }
 
    /**
-    * Encode text for output
+    * Encode text for output - preserves UTF-8 multi-byte characters
+    * NOTE: Breaking change in v3.3.0 - removed URI encoding to fix issue #385
+    * Chinese/Japanese/Korean and other multi-byte characters now output as UTF-8
     * @param {string} str - The string to encode
-    * @returns {string} - The encoded string
+    * @returns {string} - The encoded string with legacy character replacements
     */
    flashEncode(str) {
-      let retVal = encodeURIComponent(str);
-      retVal = retVal.replace('%C2%96', '-');
-      retVal = retVal.replace('%C2%91', '%27');
-      retVal = retVal.replace('%C2%92', '%27');
-      retVal = retVal.replace('%C2%82', '%27');
-      retVal = retVal.replace('%C2%93', '%22');
-      retVal = retVal.replace('%C2%94', '%22');
-      retVal = retVal.replace('%C2%84', '%22');
-      retVal = retVal.replace('%C2%8B', '%C2%AB');
-      retVal = retVal.replace('%C2%9B', '%C2%BB');
-
+      if (!str) return str;
+      
+      let retVal = str;
+      
+      // Apply legacy Flash-specific character replacements
+      // These handle problematic characters from old PDF encodings
+      retVal = retVal.replace(/\u0096/g, '-');      // En dash
+      retVal = retVal.replace(/\u0091/g, "'");      // Left single quote
+      retVal = retVal.replace(/\u0092/g, "'");      // Right single quote
+      retVal = retVal.replace(/\u0082/g, "'");      // Low single quote
+      retVal = retVal.replace(/\u0093/g, '"');      // Left double quote
+      retVal = retVal.replace(/\u0094/g, '"');      // Right double quote
+      retVal = retVal.replace(/\u0084/g, '"');      // Low double quote
+      retVal = retVal.replace(/\u008B/g, '«');      // Left guillemet
+      retVal = retVal.replace(/\u009B/g, '»');      // Right guillemet
+      
       return retVal;
    }
 
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "pdf2json",
-	"version": "3.2.3",
+	"version": "3.3.0",
 	"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
 	"keywords": [
 		"pdf",
diff --git a/src/cli/p2jcli.ts b/src/cli/p2jcli.ts
@@ -137,7 +137,7 @@ class PDFProcessor {
 				);
 			}
 
-			const outputStream = fs.createWriteStream(this.outputPath);
+			const outputStream = fs.createWriteStream(this.outputPath, { encoding: 'utf8' });
 			outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));
 			outputStream.on("error", (err) => this.onPrimaryError(err, reject));
 
@@ -163,7 +163,7 @@ class PDFProcessor {
 			}
 
 			this.pdfParser.on("pdfParser_dataReady", (evtData: PDFParserData) => {
-				fs.writeFile(this.outputPath, JSON.stringify(evtData), (err) => {
+				fs.writeFile(this.outputPath, JSON.stringify(evtData), 'utf8', (err) => {
 					if (err) {
 						this.onPrimaryError(err, reject);
 					} else {
diff --git a/test/_test_type3glyph.cjs b/test/_test_type3glyph.cjs
@@ -32,28 +32,27 @@ describe('Type3 Glyph Font Tests', () => {
 					expect(pdfData).toBeDefined();
 					expect(pdfData.Pages).toBeDefined();
 					expect(pdfData.Pages.length).toBe(1);
-					
 					const page = pdfData.Pages[0];
 					expect(page.Texts).toBeDefined();
 					expect(page.Texts.length).toBe(2); // Should have both Type3 and regular text
 					
 					// Check for Type3 text "CONTENT"
 					const type3Text = page.Texts.find(text => 
-						text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
+						text.R && text.R[0] && text.R[0].T === 'CONTENT'
 					);
 					expect(type3Text).toBeDefined();
-					expect(type3Text.R[0].T).toBe('CONTENT');
+					expect((type3Text.R[0].T)).toBe('CONTENT');
 					
 					// Check for regular text "Added Text from Acrobat"
 					const regularText = page.Texts.find(text => 
-						text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'Added Text from Acrobat'
+						text.R && text.R[0] && text.R[0].T === 'Added Text from Acrobat'
 					);
 					expect(regularText).toBeDefined();
-					expect(decodeURIComponent(regularText.R[0].T)).toBe('Added Text from Acrobat');
+					expect(regularText.R[0].T).toBe('Added Text from Acrobat');
 					
 					console.log('✓ Type3 glyph font parsing successful');
-					console.log(`✓ Found Type3 text: "${decodeURIComponent(type3Text.R[0].T)}"`);
-					console.log(`✓ Found regular text: "${decodeURIComponent(regularText.R[0].T)}"`);
+					console.log(`✓ Found Type3 text: "${type3Text.R[0].T}"`);
+					console.log(`✓ Found regular text: "${regularText.R[0].T}"`);
 					
 					resolve();
 				} catch (error) {
@@ -89,7 +88,7 @@ describe('Type3 Glyph Font Tests', () => {
 						page.Texts.forEach(text => {
 							if (text.R) {
 								text.R.forEach(run => {
-									contentOutput += decodeURIComponent(run.T) + '\n';
+									contentOutput += run.T + '\n';
 								});
 							}
 						});
@@ -104,7 +103,7 @@ describe('Type3 Glyph Font Tests', () => {
 					
 					expect(parsedJson.Pages[0].Texts.length).toBe(2);
 					expect(jsonContent).toContain('CONTENT');
-					expect(jsonContent).toContain('Added%20Text%20from%20Acrobat');
+					expect(jsonContent).toContain('Added Text from Acrobat');
 					
 					// Verify content file exists and contains both texts
 					expect(fs.existsSync(contentOutputPath)).toBe(true);
@@ -139,7 +138,7 @@ describe('Type3 Glyph Font Tests', () => {
 					
 					// Find Type3 text
 					const type3Text = page.Texts.find(text => 
-						text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
+						text.R && text.R[0] && text.R[0].T === 'CONTENT'
 					);
 					
 					// Verify Type3 text has proper positioning

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ export class ParserStream extends Transform {`
`11`	`11`	`}`
`12`	`12`
`13`	`13`	`static createOutputStream(outputPath, resolve, reject) {`
`14`		`- const outputStream = fs.createWriteStream(outputPath);`
	`14`	`+ const outputStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });`
`15`	`15`	`outputStream.on('finish', () => resolve(outputPath));`
`16`	`16`	`outputStream.on('error', err => reject(err) );`
`17`	`17`	`return outputStream;`
`@@ -71,7 +71,7 @@ export class StringifyStream extends Transform {`
`71`	`71`	`}`
`72`	`72`
`73`	`73`	`_transform(obj, encoding, callback){`
`74`		`- this.push(JSON.stringify(obj));`
	`74`	`+ this.push(JSON.stringify(obj), 'utf8');`
`75`	`75`	`callback();`
`76`	`76`	`}`
`77`	`77`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "pdf2json",`
`3`		`- "version": "3.2.3",`
	`3`	`+ "version": "3.3.0",`
`4`	`4`	`"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",`
`5`	`5`	`"keywords": [`
`6`	`6`	`"pdf",`
Original file line number	Diff line number	Diff line change
`@@ -137,7 +137,7 @@ class PDFProcessor {`
`137`	`137`	`);`
`138`	`138`	`}`
`139`	`139`
`140`		`- const outputStream = fs.createWriteStream(this.outputPath);`
	`140`	`+ const outputStream = fs.createWriteStream(this.outputPath, { encoding: 'utf8' });`
`141`	`141`	`outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));`
`142`	`142`	`outputStream.on("error", (err) => this.onPrimaryError(err, reject));`
`143`	`143`
`@@ -163,7 +163,7 @@ class PDFProcessor {`
`163`	`163`	`}`
`164`	`164`
`165`	`165`	`this.pdfParser.on("pdfParser_dataReady", (evtData: PDFParserData) => {`
`166`		`- fs.writeFile(this.outputPath, JSON.stringify(evtData), (err) => {`
	`166`	`+ fs.writeFile(this.outputPath, JSON.stringify(evtData), 'utf8', (err) => {`
`167`	`167`	`if (err) {`
`168`	`168`	`this.onPrimaryError(err, reject);`
`169`	`169`	`} else {`