1+ /**
2+ * @import PDFDocument from '../../lib/document';
3+ */
4+
5+ /**
6+ * @typedef {object } TextStream
7+ * @property {string } text
8+ * @property {string } font
9+ * @property {number } fontSize
10+ *
11+ * @typedef {string | Buffer } PDFDataItem
12+ * @typedef {Array<PDFDataItem> } PDFData
13+ *
14+ * @typedef {object } PDFDataObject
15+ * @property {PDFDataItem[] } items
16+ */
17+
18+ /**
19+ * @param {PDFDocument } doc
20+ * @return {PDFData }
21+ */
122function logData ( doc ) {
223 const loggedData = [ ] ;
324 const originalMethod = doc . _write ;
@@ -18,4 +39,83 @@ function joinTokens(...args) {
1839 return r ;
1940}
2041
21- export { logData , joinTokens } ;
/**
 * @description
 * Groups the logged PDF data into objects. An object starts at a string
 * matching /^\d+\s0\sobj/ and ends at the exact string 'endobj'; neither
 * delimiter is stored. Strings and Buffers seen in between are collected,
 * in order, into that object's `items`. Anything outside an object, and any
 * item that is neither a string nor a Buffer, is ignored.
 * @param {PDFData} data
 * @return {Array<PDFDataObject>}
 */
function getObjects(data) {
  const objectHeader = /^\d+\s0\sobj/;
  const found = [];
  // The object currently being filled, or null when between objects.
  let open = null;

  for (const chunk of data) {
    const isBinary = chunk instanceof Buffer;
    if (!isBinary && typeof chunk !== 'string') {
      continue;
    }

    // Only strings can act as delimiters; Buffers are always payload.
    if (!isBinary && objectHeader.test(chunk)) {
      open = { items: [] };
      found.push(open);
      continue;
    }
    if (!isBinary && chunk === 'endobj') {
      open = null;
      continue;
    }

    if (open) {
      open.items.push(chunk);
    }
  }

  return found;
}
69+
/**
 * @description
 * Extracts the font, font size and text from a content stream, using the
 * first `/<font> <size> Tf` operator and the first `[...] TJ` array found.
 *
 * Text decoding is simplified: every pair of hex digits is mapped directly
 * to a character code. The correct way is to retrieve the encoding from the
 * /Resources /Font dictionary and decode using it.
 * https://stackoverflow.com/a/29468049/5724645
 *
 * @param {Buffer} textStream
 * @return {TextStream | undefined} undefined when the stream has no `Tf`
 * operator or no non-empty `TJ` array
 */
function parseTextStream(textStream) {
  const decodedStream = textStream.toString('utf8');

  // Font name and size. The size may be an integer or a real number
  // (e.g. "10.5", "12.", ".5"), so it is not restricted to digits only.
  const fontMatch = decodedStream.match(
    /\/([A-Za-z0-9]+)\s+(\d+\.?\d*|\.\d+)\s+Tf/,
  );
  if (!fontMatch) {
    return undefined;
  }
  const [, font, rawFontSize] = fontMatch;
  const fontSize = Number.parseFloat(rawFontSize);

  // Contents of the TJ array; hex strings are mixed with kerning numbers.
  const tjMatch = decodedStream.match(/\[([^\]]+)\]\s+TJ/);
  if (!tjMatch) {
    return undefined;
  }

  let text = '';
  // Match all hex strings like <...>
  for (const [, rawHex] of tjMatch[1].matchAll(/<([0-9a-fA-F]+)>/g)) {
    // Per the PDF spec, a missing final hex digit is assumed to be 0.
    const hex = rawHex.length % 2 === 0 ? rawHex : `${rawHex}0`;
    for (let i = 0; i < hex.length; i += 2) {
      const code = Number.parseInt(hex.slice(i, i + 2), 16);
      // 0x85 is emitted as three dots (presumably the WinAnsi ellipsis);
      // every other byte, including \n and \r, maps straight to its char code.
      text += code === 0x85 ? '...' : String.fromCharCode(code);
    }
  }

  return { text, font, fontSize };
}
120+
121+ export { logData , joinTokens , parseTextStream , getObjects } ;
0 commit comments