Skip to content

Commit f1ed753

Browse files
author
Andrey
committed
Added text extraction API
1 parent 8bf7f97 commit f1ed753

File tree

1 file changed

+87
-11
lines changed

1 file changed

+87
-11
lines changed

index.js

Lines changed: 87 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,35 +36,41 @@ app.get('/files/:filename', (req, res) => {
3636
app.get('/optimize/:filename', (req, res) => {
3737
const filename = req.params.filename;
3838
const ext = path.parse(filename).ext;
39-
39+
4040
const inputPath = path.resolve(__dirname, filesPath, filename);
41-
const outputPath = path.resolve(__dirname, filesPath, `optimized_${filename}`);
41+
const outputPath = path.resolve(
42+
__dirname,
43+
filesPath,
44+
`optimized_${filename}`,
45+
);
4246

4347
if (ext !== '.pdf') {
4448
res.statusCode = 500;
45-
res.end(`Only PDFs can be optimized. Cannot optimize file with extension: ${ext}.`);
49+
res.end(
50+
`Only PDFs can be optimized. Cannot optimize file with extension: ${ext}.`,
51+
);
4652
}
4753

4854
const main = async () => {
4955
const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath);
5056
await doc.initSecurityHandler();
51-
57+
5258
// compress
5359
const image_settings = new PDFNet.Optimizer.ImageSettings();
5460
image_settings.setCompressionMode(
5561
PDFNet.Optimizer.ImageSettings.CompressionMode.e_jpeg,
5662
);
57-
63+
5864
const opt_settings = new PDFNet.Optimizer.OptimizerSettings();
5965
opt_settings.setColorImageSettings(image_settings);
6066
opt_settings.setGrayscaleImageSettings(image_settings);
61-
67+
6268
await PDFNet.Optimizer.optimize(doc, opt_settings);
6369

6470
// viewer optimizer + linearization
6571
const opts = new PDFNet.PDFDoc.ViewerOptimizedOptions();
6672
opts.setThumbnailRenderingThreshold(0);
67-
73+
6874
await doc.saveViewerOptimized(outputPath, opts);
6975
};
7076

@@ -80,7 +86,9 @@ app.get('/thumbnail/:filename', (req, res) => {
8086

8187
if (ext !== '.pdf') {
8288
res.statusCode = 500;
83-
res.end(`Only PDFs can return a thumbnail. Cannot return a thumb for a file with extension: ${ext}.`);
89+
res.end(
90+
`Only PDFs can return a thumbnail. Cannot return a thumb for a file with extension: ${ext}.`,
91+
);
8492
}
8593

8694
const main = async () => {
@@ -110,15 +118,83 @@ app.get('/convert/:filename', (req, res) => {
110118
const pdfdoc = await PDFNet.PDFDoc.create();
111119
await pdfdoc.initSecurityHandler();
112120
await PDFNet.Convert.toPdf(pdfdoc, inputPath);
113-
pdfdoc.save(`${pathname}${filename}.pdf`, PDFNet.SDFDoc.SaveOptions.e_linearized);
121+
pdfdoc.save(
122+
`${pathname}${filename}.pdf`,
123+
PDFNet.SDFDoc.SaveOptions.e_linearized,
124+
);
114125
ext = '.pdf';
115126
};
116127

117128
PDFNetEndpoint(main, outputPath, res);
118129
});
119130

131+
/**
 * GET /textextract/:filename-:outext-:pagenumber
 *
 * Extracts the text of a single page of a PDF stored under `filesPath` and
 * writes it next to the source file as `<filename>.<outext>`.
 *
 * Route params:
 *   filename   - name of the PDF inside `filesPath` (must end in `.pdf`).
 *   outext     - `xml` produces PDFNet's XML output; any other value produces
 *                plain text. Falls back to `txt` when empty.
 *   pagenumber - page to extract; must be a positive integer.
 *
 * Responds with 500 when the file is not a PDF and with 400 when the page
 * number is invalid. Otherwise the work is handed to `PDFNetEndpoint`, which
 * receives the output path and the response object.
 */
app.get('/textextract/:filename-:outext-:pagenumber', (req, res) => {
  const filename = req.params.filename;
  const outputExt = req.params.outext || 'txt';
  const pageNumber = Number(req.params.pagenumber);
  const ext = path.parse(filename).ext;

  if (ext !== '.pdf') {
    res.statusCode = 500;
    res.end(`File is not a PDF. Please convert it first.`);
    // Stop here: without this return the handler would keep going and
    // PDFNetEndpoint would try to answer a response that is already finished.
    return;
  }

  // Number('abc') is NaN and Number('') is 0; reject those before they reach
  // pdfdoc.getPage().
  if (!Number.isInteger(pageNumber) || pageNumber < 1) {
    res.statusCode = 400;
    res.end(`Invalid page number: ${req.params.pagenumber}.`);
    return;
  }

  const inputPath = path.resolve(__dirname, filesPath, filename);
  const outputPath = path.resolve(
    __dirname,
    filesPath,
    `${filename}.${outputExt}`,
  );

  const main = async () => {
    await PDFNet.initialize();
    await PDFNet.startDeallocateStack();
    try {
      const pdfdoc = await PDFNet.PDFDoc.createFromFilePath(inputPath);
      await pdfdoc.initSecurityHandler();
      const page = await pdfdoc.getPage(pageNumber);

      // An id of '0' is how this code recognises a page that does not exist.
      if (page.id === '0') {
        throw new Error(`Page ${pageNumber} not found in ${filename}.`);
      }

      const txt = await PDFNet.TextExtractor.create();
      // Fixed clip rectangle kept from the original implementation.
      // NOTE(review): text outside 612 x 794 is presumably not extracted on
      // larger pages -- confirm whether the page's own box should be used.
      const rect = new PDFNet.Rect(0, 0, 612, 794);
      await txt.begin(page, rect);

      const text =
        outputExt === 'xml'
          ? await txt.getAsXML(
              PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements |
                PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox |
                PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info,
            )
          : await txt.getAsText();

      // Await the write so `main` only resolves once the output file is
      // completely on disk; a failed write now rejects instead of being
      // logged and ignored.
      await fs.promises.writeFile(outputPath, text);
    } catch (err) {
      console.log(err);
      console.log(err.stack);
      // Re-throw so the rejection reaches PDFNetEndpoint's error handling
      // instead of being reported as a success.
      throw err;
    } finally {
      // Pair every startDeallocateStack with an endDeallocateStack, including
      // on the error paths.
      await PDFNet.endDeallocateStack();
    }
  };

  PDFNetEndpoint(main, outputPath, res);
});
195+
120196
const PDFNetEndpoint = (main, pathname, res) => {
121-
PDFNet.runWithCleanup(main)
197+
PDFNet.runWithCleanup(main)
122198
.catch(function (error) {
123199
res.statusCode = 500;
124200
res.end(`Error : ${JSON.stringify(error)}.`);
@@ -142,4 +218,4 @@ app.listen(port, () =>
142218
console.log(
143219
`nodejs-convert-file-server listening at http://localhost:${port}`,
144220
),
145-
);
221+
);

0 commit comments

Comments
 (0)