-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathTikaParser.js
More file actions
79 lines (70 loc) · 3.04 KB
/
TikaParser.js
File metadata and controls
79 lines (70 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/**
 * Extracts text content and metadata from the resource a document points at,
 * using Apache Tika, and stores the results as fields on the document.
 *
 * The script relies on names provided by the hosting environment:
 *   - `java` / `org` package globals for Java interop (ES5 syntax is kept on
 *     purpose; NOTE(review): presumably this runs under Nashorn - confirm),
 *   - a `logger` object exposing `info(...)` and `error(...)`.
 *
 * @param {Object} doc Document exposing `getId()` (used as the URL of the
 *     resource to fetch) and `addField(name, value)`.
 * @returns {Object} The same `doc` instance. On success it carries a
 *     `_raw_content_` field with the extracted text plus one field per Tika
 *     metadata name. If fetching or parsing fails the error is logged and
 *     `doc` is returned without those fields (best effort).
 */
var tikaParser = function (doc) {
    var PDF_MIME_TYPE = "application/pdf";
    var XLSX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    /**
     * Chooses the Tika parser for a detected MIME type: the dedicated PDF and
     * OOXML parsers for those two types, auto-detection for everything else.
     */
    function createParser(mimeType) {
        if (mimeType === PDF_MIME_TYPE) {
            return new org.apache.tika.parser.pdf.PDFParser();
        }
        if (mimeType === XLSX_MIME_TYPE) {
            logger.info("Using XslX parser...");
            return new org.apache.tika.parser.microsoft.ooxml.OOXMLParser();
        }
        return new org.apache.tika.parser.AutoDetectParser();
    }

    /** Copies every metadata entry onto the target document, logging each one. */
    function copyMetadata(metadata, target) {
        var names = metadata.names();
        if (names === null || names === undefined) {
            return;
        }
        logger.info("Metadata of the document:");
        // Index loop on purpose: `for...in` over an array (Java or JS) yields
        // indices, not the metadata names themselves.
        for (var i = 0; i < names.length; i++) {
            var name = names[i];
            var value = metadata.get(name);
            logger.info(name + ": " + value);
            target.addField(name, value);
        }
    }

    /** Closes the stream if it was opened; a failing close is only logged. */
    function closeQuietly(stream) {
        if (stream === null || stream === undefined) {
            return;
        }
        try {
            stream.close();
        } catch (closeError) {
            logger.error(closeError);
        }
    }

    /** Fetches the resource behind `doc.getId()`, parses it and enriches `doc`. */
    function tikaParse(doc) {
        var url = doc.getId();
        // -1 disables BodyContentHandler's write limit so long documents are
        // not truncated.
        var textHandler = new org.apache.tika.sax.BodyContentHandler(-1);
        var metadata = new org.apache.tika.metadata.Metadata();
        var context = new org.apache.tika.parser.ParseContext();
        var input = null;

        try {
            var urlobj = new java.net.URL(url);
            // Detect once and reuse the result: every detect(URL) call reads
            // from the URL again.
            var mimeType = String(new org.apache.tika.Tika().detect(urlobj));
            var parser = createParser(mimeType);

            input = urlobj.openStream();
            parser.parse(input, textHandler, metadata, context);

            var content = textHandler.toString();
            if (content !== null && content !== undefined) {
                doc.addField("_raw_content_", content);
            }
            copyMetadata(metadata, doc);
        } catch (parseError) {
            // Best effort: a document that cannot be fetched or parsed is
            // passed through unchanged rather than aborting the caller.
            logger.error(parseError);
        } finally {
            closeQuietly(input);
        }
        return doc;
    }

    // Run the extraction; the helper above is otherwise never invoked.
    return tikaParse(doc);
};