@@ -36,35 +36,41 @@ app.get('/files/:filename', (req, res) => {
3636app . get ( '/optimize/:filename' , ( req , res ) => {
3737 const filename = req . params . filename ;
3838 const ext = path . parse ( filename ) . ext ;
39-
39+
4040 const inputPath = path . resolve ( __dirname , filesPath , filename ) ;
41- const outputPath = path . resolve ( __dirname , filesPath , `optimized_${ filename } ` ) ;
41+ const outputPath = path . resolve (
42+ __dirname ,
43+ filesPath ,
44+ `optimized_${ filename } ` ,
45+ ) ;
4246
4347 if ( ext !== '.pdf' ) {
4448 res . statusCode = 500 ;
45- res . end ( `Only PDFs can be optimized. Cannot optimize file with extension: ${ ext } .` ) ;
49+ res . end (
50+ `Only PDFs can be optimized. Cannot optimize file with extension: ${ ext } .` ,
51+ ) ;
4652 }
4753
4854 const main = async ( ) => {
4955 const doc = await PDFNet . PDFDoc . createFromFilePath ( inputPath ) ;
5056 await doc . initSecurityHandler ( ) ;
51-
57+
5258 // compress
5359 const image_settings = new PDFNet . Optimizer . ImageSettings ( ) ;
5460 image_settings . setCompressionMode (
5561 PDFNet . Optimizer . ImageSettings . CompressionMode . e_jpeg ,
5662 ) ;
57-
63+
5864 const opt_settings = new PDFNet . Optimizer . OptimizerSettings ( ) ;
5965 opt_settings . setColorImageSettings ( image_settings ) ;
6066 opt_settings . setGrayscaleImageSettings ( image_settings ) ;
61-
67+
6268 await PDFNet . Optimizer . optimize ( doc , opt_settings ) ;
6369
6470 // viewer optimizer + linearization
6571 const opts = new PDFNet . PDFDoc . ViewerOptimizedOptions ( ) ;
6672 opts . setThumbnailRenderingThreshold ( 0 ) ;
67-
73+
6874 await doc . saveViewerOptimized ( outputPath , opts ) ;
6975 } ;
7076
@@ -80,7 +86,9 @@ app.get('/thumbnail/:filename', (req, res) => {
8086
8187 if ( ext !== '.pdf' ) {
8288 res . statusCode = 500 ;
83- res . end ( `Only PDFs can return a thumbnail. Cannot return a thumb for a file with extension: ${ ext } .` ) ;
89+ res . end (
90+ `Only PDFs can return a thumbnail. Cannot return a thumb for a file with extension: ${ ext } .` ,
91+ ) ;
8492 }
8593
8694 const main = async ( ) => {
@@ -110,15 +118,83 @@ app.get('/convert/:filename', (req, res) => {
110118 const pdfdoc = await PDFNet . PDFDoc . create ( ) ;
111119 await pdfdoc . initSecurityHandler ( ) ;
112120 await PDFNet . Convert . toPdf ( pdfdoc , inputPath ) ;
113- pdfdoc . save ( `${ pathname } ${ filename } .pdf` , PDFNet . SDFDoc . SaveOptions . e_linearized ) ;
121+ pdfdoc . save (
122+ `${ pathname } ${ filename } .pdf` ,
123+ PDFNet . SDFDoc . SaveOptions . e_linearized ,
124+ ) ;
114125 ext = '.pdf' ;
115126 } ;
116127
117128 PDFNetEndpoint ( main , outputPath , res ) ;
118129} ) ;
119130
/**
 * GET /textextract/:filename-:outext-:pagenumber
 *
 * Extracts the text of a single page of a PDF stored under `filesPath` and
 * writes it next to the source file as `<filename>.<outext>`. The response is
 * produced by `PDFNetEndpoint`, which receives the output path.
 *
 * Route params:
 *   - filename:   name of the PDF inside `filesPath` (must end in `.pdf`).
 *   - outext:     output extension; `xml` selects XML output, anything else
 *                 produces plain text. Falls back to `txt` when empty.
 *   - pagenumber: page to extract; must be a positive integer.
 */
app.get('/textextract/:filename-:outext-:pagenumber', (req, res) => {
  const { filename } = req.params;
  const outputExt = req.params.outext || 'txt';
  const pageNumber = Number(req.params.pagenumber);
  const { ext } = path.parse(filename);

  if (ext !== '.pdf') {
    res.statusCode = 500;
    res.end(`File is not a PDF. Please convert it first.`);
    // Stop here: without this return the handler would go on to process the
    // file and attempt to send a second response.
    return;
  }

  // Both path components come from the URL; refuse anything that would
  // resolve outside of the files directory (e.g. an encoded "../").
  const outputName = `${filename}.${outputExt}`;
  if (path.basename(outputName) !== outputName) {
    res.statusCode = 400;
    res.end('Invalid file name or output extension.');
    return;
  }

  // Number('abc') is NaN and Number('') is 0; neither is a usable page number.
  if (!Number.isInteger(pageNumber) || pageNumber < 1) {
    res.statusCode = 400;
    res.end('Page number must be a positive integer.');
    return;
  }

  const inputPath = path.resolve(__dirname, filesPath, filename);
  const outputPath = path.resolve(__dirname, filesPath, outputName);

  const main = async () => {
    // NOTE(review): PDFNetEndpoint runs this through PDFNet.runWithCleanup,
    // which may make the explicit initialize/deallocate-stack calls
    // redundant - kept as in the original; confirm against the SDK docs.
    await PDFNet.initialize();
    await PDFNet.startDeallocateStack();
    try {
      const pdfdoc = await PDFNet.PDFDoc.createFromFilePath(inputPath);
      await pdfdoc.initSecurityHandler();
      const page = await pdfdoc.getPage(pageNumber);

      // The original treats a page whose id is '0' as "not found".
      if (!page || page.id === '0') {
        // Throw instead of returning a sentinel so the failure reaches the
        // .catch() in PDFNetEndpoint and is reported to the client.
        throw new Error('Page not found.');
      }

      const txt = await PDFNet.TextExtractor.create();
      // Fixed clip rectangle carried over from the original; presumably sized
      // for a letter-like page - verify for other page sizes.
      const rect = new PDFNet.Rect(0, 0, 612, 794);
      await txt.begin(page, rect);

      const text =
        outputExt === 'xml'
          ? await txt.getAsXML(
              PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements |
                PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox |
                PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info,
            )
          : await txt.getAsText();

      // Await the write so main() only resolves once the output file exists;
      // the callback form let main() finish before the file was on disk.
      await fs.promises.writeFile(outputPath, text);
    } catch (err) {
      console.log(err);
      console.log(err.stack);
      // Re-throw so the caller can answer with an error status.
      throw err;
    } finally {
      // Always balance startDeallocateStack, including on failure paths.
      await PDFNet.endDeallocateStack();
    }
  };

  PDFNetEndpoint(main, outputPath, res);
});
195+
120196const PDFNetEndpoint = ( main , pathname , res ) => {
121- PDFNet . runWithCleanup ( main )
197+ PDFNet . runWithCleanup ( main )
122198 . catch ( function ( error ) {
123199 res . statusCode = 500 ;
124200 res . end ( `Error : ${ JSON . stringify ( error ) } .` ) ;
@@ -142,4 +218,4 @@ app.listen(port, () =>
142218 console . log (
143219 `nodejs-convert-file-server listening at http://localhost:${ port } ` ,
144220 ) ,
145- ) ;
221+ ) ;
0 commit comments