Skip to content

Commit 716a223

Browse files
committed
Merge master and resolve conflicts
2 parents 84744c0 + b5bd593 commit 716a223

File tree

7 files changed

+163
-19
lines changed

7 files changed

+163
-19
lines changed

README.md

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,12 @@ print(parsed["metadata"])
5252
print(parsed["content"])
5353
```
5454

55-
Parser Interface (new)
55+
Parser Interface
5656
----------------------
57+
The parser interface extracts text and metadata using the /rmeta
58+
interface. This is one of the better ways to get the internal XHTML
59+
content extracted.
60+
5761
```
5862
#!/usr/bin/env python
5963
import tika
@@ -68,17 +72,37 @@ parsed = parser.from_file('/path/to/file', 'http://tika:9998/tika')
6872
string_parsed = parser.from_buffer('Good evening, Dave', 'http://tika:9998/tika')
6973
```
7074

71-
Detect Interface (new)
75+
Unpack Interface
76+
----------------
77+
The unpack interface handles both metadata and text extraction in a single
78+
call and internally receives a tarball of metadata and text entries that
79+
is internally unpacked, reducing the wire load for extraction.
80+
81+
```
82+
#!/usr/bin/env python
83+
import tika
84+
from tika import unpack
85+
parsed = unpack.from_file('/path/to/file')
86+
```
87+
88+
Detect Interface
7289
----------------------
90+
The detect interface provides an IANA MIME type classification for the
91+
provided file.
92+
7393
```
7494
#!/usr/bin/env python
7595
import tika
7696
from tika import detector
7797
print(detector.from_file('/path/to/file'))
7898
```
7999

80-
Config Interface (new)
100+
Config Interface
81101
----------------------
102+
The config interface allows you to inspect the Tika Server environment's
103+
configuration including what parsers, mime types, and detectors the
104+
server has been configured with.
105+
82106
```
83107
#!/usr/bin/env python
84108
import tika
@@ -88,16 +112,22 @@ print(config.getMimeTypes())
88112
print(config.getDetectors())
89113
```
90114

91-
Language Detection Interface (new)
115+
Language Detection Interface
92116
---------------------------------
117+
The language detection interface provides a 2 character language
118+
code detected based on the text in the provided file.
119+
93120
```
94121
#!/usr/bin/env python
95122
from tika import language
96123
print(language.from_file('/path/to/file'))
97124
```
98125

99-
Translate Interface (new)
126+
Translate Interface
100127
------------------------
128+
The translate interface translates the text automatically extracted
129+
by Tika from the source language to the destination language.
130+
101131
```
102132
#!/usr/bin/env python
103133
from tika import translate

tika/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
__version__ = "1.14"
17+
__version__ = "1.14.2"
1818

1919
try:
2020
__import__('pkg_resources').declare_namespace(__name__)

tika/tests/arguments/test_remote_content.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
,"https://catalog.data.gov/dataset/free-application-for-federal-student-aid-201213/resource/f2437ade-d39f-40fb-9dba-19c3b26a8662",,
44
,"https://studentaid.ed.gov/sites/default/files/fsawg/datacenter/library/FAFSAReportDefinitions.doc",,"FAFSAReportDefinitions.doc"
55
,"http://open.defense.gov/Portals/23/Documents/FOIA_WebsiteCompliance.ppt",,"FOIA_WebsiteCompliance.ppt"
6-
,"http://www.ars.usda.gov/is/video/asx/freeze.asx",,"freeze.asx"
76
,"https://catalog.data.gov/dataset/geologic-map-of-the-state-of-hawaii",,"geologic-map-of-the-state-of-hawaii"
87
,"http://data.octo.dc.gov/Attachment.aspx?where=Citywide&area=&what=CSV&date=Issueddate&from=4/12/2015%2012:00:00%20AM&to=4/19/2015%2010:00:00%20PM&dataset=DCRA_PERMIT&datasetid=5&whereInd=0&areaInd=0&whatInd=0&dateInd=0&whenInd=0",,"DCRA_PERMIT__from04_12_2015__to04_19_2015.csv"
98
,"http://www.dmdc.osd.mil/Rank_Gender_Race.xls",,"Rank_Gender_Race.xls"

tika/tests/arguments/test_remote_metadata.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
,"http://downloads.bbc.co.uk/podcasts/worldservice/6min_vocab/6min_vocab_20150511-1134a.mp3",,"6min_vocab_20150511-1134a.mp3"
1313
,"http://www.ars.usda.gov/is/podcasts/tomatoes.mp4",,"tomatoes.mp4"
1414
,"http://media.ars.usda.gov/is/mp4/freezeplants.mp4",,"freezeplants.mp4"
15-
,"http://www.ars.usda.gov/is/video/asx/freeze.asx",,"freeze.asx"
1615
,"https://catalog.data.gov/dataset/geologic-map-of-the-state-of-hawaii",,"geologic-map-of-the-state-of-hawaii"
1716
,"http://pubs.usgs.gov/of/2007/1089/Haw_St_tabfiles.zip",,"Haw_St_tabfiles.zip"
1817
,"http://data.octo.dc.gov/Attachment.aspx?where=Citywide&area=&what=CSV&date=Issueddate&from=4/12/2015%2012:00:00%20AM&to=4/19/2015%2010:00:00%20PM&dataset=DCRA_PERMIT&datasetid=5&whereInd=0&areaInd=0&whatInd=0&dateInd=0&whenInd=0",,"DCRA_PERMIT__from04_12_2015__to04_19_2015.csv"

tika/tests/arguments/test_unsupported.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ Content-Type,URL,Language,local
55
,http://www.defense.gov/multimedia/web_graphics/coastgrd/USCGb1.eps,,USCGb1.eps
66
,http://www.defense.gov/multimedia/web_graphics/coastgrd/USCGb.wmf,,USCGb.wmf
77
,http://www.ars.usda.gov/is/podcasts/tomatoes.wmv,,tomatoes.wmv
8+
,http://media.ars.usda.gov/is/wm/freeze.wmv,,freeze.wmv

tika/tika.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def make_content_disposition_header(fn):
136136
log_file = os.path.join(log_path, 'tika.log')
137137

138138
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
139-
log = logging.getLogger()
139+
log = logging.getLogger('tika.tika')
140140

141141
# File logs
142142
fileHandler = logging.FileHandler(log_file)
@@ -272,7 +272,7 @@ def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint,
272272

273273
def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
274274
responseMimeType='application/json',
275-
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
275+
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
276276
'''
277277
Parse the objects and return extracted metadata and/or text in JSON format.
278278
:param option:
@@ -289,7 +289,7 @@ def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
289289

290290
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
291291
responseMimeType='application/json',
292-
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}):
292+
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False):
293293
'''
294294
Parse the object and return extracted metadata and/or text in JSON format.
295295
:param option:
@@ -307,11 +307,9 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
307307
service = services.get(option, services['all'])
308308
if service == '/tika': responseMimeType = 'text/plain'
309309
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
310-
{
311-
'Accept': responseMimeType,
312-
'Content-Disposition': make_content_disposition_header(path)
313-
},
314-
verbose, tikaServerJar)
310+
{'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path)},
311+
verbose, tikaServerJar, rawResponse=rawResponse)
312+
315313

316314
if file_type == 'remote': os.unlink(path)
317315
return (status, response)
@@ -479,8 +477,9 @@ def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServer
479477
return (status, response)
480478

481479

482-
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
483-
httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post},classpath=None):
480+
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
481+
httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
482+
rawResponse=False):
484483
'''
485484
Call the Tika Server, do some error checking, and return the response.
486485
:param verb:
@@ -522,8 +521,12 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti
522521
print(sys.stderr, "Response headers: ", resp.headers)
523522
if resp.status_code != 200:
524523
log.warning('Tika server returned status: %d', resp.status_code)
524+
525525
resp.encoding = "utf-8"
526-
return (resp.status_code, resp.text)
526+
if rawResponse:
527+
return (resp.status_code, resp.content)
528+
else:
529+
return (resp.status_code, resp.text)
527530

528531

529532
def checkTikaServer(serverHost=ServerHost, port = Port, tikaServerJar=TikaServerJar,classpath=None):

tika/unpack.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
19+
from .tika import parse1, callServer, ServerEndpoint
20+
import tarfile
21+
from io import BytesIO, TextIOWrapper
22+
import csv
23+
from sys import version_info
24+
25+
# Text-mode adapter for file objects returned by tarfile's extractfile().
#
# On Python 3, csv.reader() needs a text stream, so the binary tar member is
# wrapped in TextIOWrapper (which relies on the .readable() method that
# Python 3 tar member objects provide). On earlier versions the tar member
# object can be handed to csv.reader() as is, so the adapter is the identity
# function.
# NOTE(review): TextIOWrapper is called here without an explicit encoding,
# so decoding presumably follows the interpreter's default text encoding --
# confirm this matches what the Tika server emits.
_text_wrapper = TextIOWrapper if version_info.major >= 3 else lambda x: x
30+
31+
def from_file(filename, serverEndpoint=ServerEndpoint):
    '''
    Unpack a file through the Tika server's /unpack/all service.

    The file is sent via parse1() with the 'unpack' option, asking for an
    'application/x-tar' response delivered as raw bytes; the resulting
    (status, tarball) pair is then handed to _parse().

    :param filename: file to send to the server (passed to parse1 as its
        urlOrPath argument)
    :param serverEndpoint: Tika server end point (optional)
    :return: whatever _parse() builds from the response - a dict with
        "content", "metadata" and "attachments" keys, or an empty dict when
        the response carries no tar data
    '''
    unpackServices = {
        'meta': '/meta',
        'text': '/tika',
        'all': '/rmeta/xml',
        'unpack': '/unpack/all',
    }
    statusAndTar = parse1('unpack', filename, serverEndpoint,
                          responseMimeType='application/x-tar',
                          services=unpackServices,
                          rawResponse=True)
    return _parse(statusAndTar)
44+
45+
46+
def from_buffer(string, serverEndpoint=ServerEndpoint):
    '''
    Unpack in-memory content through the Tika server's /unpack/all service.

    The buffer is PUT to the server with an 'application/x-tar' Accept
    header, verbose output disabled and the raw (bytes) response requested;
    the (status, tarball) pair is then handed to _parse().

    :param string: buffered content to send as the request body
    :param serverEndpoint: Tika server URL (Optional)
    :return: parsed content as built by _parse() - a dict with "content",
        "metadata" and "attachments" keys, or an empty dict when the
        response carries no tar data
    '''
    requestHeaders = {'Accept': 'application/x-tar'}
    statusCode, tarBytes = callServer('put', serverEndpoint, '/unpack/all',
                                      string, requestHeaders, False,
                                      rawResponse=True)
    return _parse((statusCode, tarBytes))
58+
59+
60+
def _utf8_text(fileobj, newline=None):
    '''
    Present a binary tar member stream as text.

    On Python 3 the stream is wrapped in a TextIOWrapper with an explicit
    UTF-8 encoding so decoding does not depend on the machine's locale.
    On Python 2 the stream is returned unchanged, since csv.reader() and
    .read() work on it directly there.

    :param fileobj: binary file object returned by TarFile.extractfile()
    :param newline: newline handling passed to TextIOWrapper (Python 3 only)
    :return: a text stream (Python 3) or the original object (Python 2)
    '''
    if version_info.major >= 3:
        # NOTE(review): assumes the server writes these entries as UTF-8 --
        # confirm against the Tika server's unpack resource.
        return TextIOWrapper(fileobj, encoding='utf-8', newline=newline)
    return fileobj


def _parse(tarOutput):
    '''
    Turn a (status, tar bytes) server response into a parsed dictionary.

    The tarball is read entirely in memory. The "__METADATA__" member is
    read as CSV rows of ``key,value[,value...]``, the "__TEXT__" member is
    read as the extracted text, and every other regular-file member is
    returned as an attachment.

    :param tarOutput: (status, response) pair where response holds the raw
        bytes of the tarball
    :return: an empty dict when tarOutput is falsy or its payload is None or
        empty; otherwise a dict with:
        "content" - text of the "__TEXT__" member ("" when absent),
        "metadata" - dict mapping each key to a single value, or to a list
        of values when the row carries more than one,
        "attachments" - dict mapping member name to its raw bytes
    :raises tarfile.TarError: when the payload is not a readable tar archive
    '''
    parsed = {}
    if not tarOutput:
        return parsed
    elif tarOutput[1] is None or tarOutput[1] == b"":
        return parsed

    metadata = {}
    content = ""
    attachments = {}

    # The context manager guarantees the archive is closed even if a member
    # turns out to be malformed.
    with tarfile.open(fileobj=BytesIO(tarOutput[1])) as tarFile:
        # get the member names
        memberNames = list(tarFile.getnames())

        # extract the metadata
        if "__METADATA__" in memberNames:
            memberNames.remove("__METADATA__")

            metadataMember = tarFile.getmember("__METADATA__")
            if not metadataMember.issym() and metadataMember.isfile():
                # newline='' lets the csv module do its own newline handling,
                # as its documentation requires for file objects.
                metadataFile = _utf8_text(tarFile.extractfile(metadataMember),
                                          newline='')
                for metadataLine in csv.reader(metadataFile):
                    # each metadata line comes as a key-value pair, with list
                    # values returned as extra values in the line. Rows
                    # without both a key and a value (e.g. blank lines) carry
                    # no metadata and are skipped instead of asserting, since
                    # asserts vanish under -O.
                    if len(metadataLine) < 2:
                        continue

                    # convert single values to non-list values to be
                    # consistent with parser metadata
                    if len(metadataLine) > 2:
                        metadata[metadataLine[0]] = metadataLine[1:]
                    else:
                        metadata[metadataLine[0]] = metadataLine[1]

        # get the content
        if "__TEXT__" in memberNames:
            memberNames.remove("__TEXT__")

            contentMember = tarFile.getmember("__TEXT__")
            if not contentMember.issym() and contentMember.isfile():
                content = _utf8_text(tarFile.extractfile(contentMember)).read()

        # get the remaining files as attachments (raw bytes, never written
        # to disk)
        for attachment in memberNames:
            attachmentMember = tarFile.getmember(attachment)
            if not attachmentMember.issym() and attachmentMember.isfile():
                attachments[attachment] = tarFile.extractfile(attachmentMember).read()

    parsed["content"] = content
    parsed["metadata"] = metadata
    parsed["attachments"] = attachments

    return parsed

0 commit comments

Comments
 (0)