Skip to content

Commit 7f65c3e

Browse files
committed
Merge branch 'rmeta-allow-gzip' of https://github.com/carantunes/tika-python into merge-pr316
2 parents 49ab87c + a183472 commit 7f65c3e

File tree

9 files changed

+309
-8
lines changed

9 files changed

+309
-8
lines changed

.travis.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
language: python
2+
23
python:
34
- "2.7"
45
- "3.4"
56
- "3.5"
67
- "3.7"
7-
install: "pip install -r requirements.txt"
8+
9+
install:
10+
- "travis_retry pip install -r requirements.txt"
11+
- "travis_retry pip install -e .[all]"
12+
813
script: py.test --cov=tika
14+
915
after_success: coveralls

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,37 @@ parsed = parser.from_file('/path/to/file', 'http://tika:9998/tika')
9292
string_parsed = parser.from_buffer('Good evening, Dave', 'http://tika:9998/tika')
9393
```
9494

95+
You can also pass a binary stream (a file object opened in `'rb'` mode):
96+
```
97+
with open(file, 'rb') as file_obj:
98+
response = tika.parser.from_file(file_obj)
99+
```
100+
101+
Gzip compression
102+
---------------------
103+
Since Tika 1.24.1 gzip compression of input and output streams is allowed.
104+
105+
Input compression can be achieved with gzip or zlib:
106+
```
107+
import zlib
108+
109+
with open(file, 'rb') as file_obj:
110+
return tika.parser.from_buffer(zlib.compress(file_obj.read()))
111+
112+
...
113+
114+
import gzip
115+
116+
with open(file, 'rb') as file_obj:
117+
return tika.parser.from_buffer(gzip.compress(file_obj.read()))
118+
```
119+
120+
Output compression is requested by sending the `Accept-Encoding` header:
121+
```
122+
with open(file, 'rb') as file_obj:
123+
return tika.parser.from_file(file_obj, headers={'Accept-Encoding': 'gzip, deflate'})
124+
```
125+
95126
Specify Output Format To XHTML
96127
---------------------
97128
The parser interface is optionally able to output the content as XHTML rather than plain text.

setup.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919
# $Id$
2020

2121
import os.path
22-
import tika
2322
from io import open
2423

24+
import tika
25+
2526
try:
2627
from ez_setup import use_setuptools
28+
2729
use_setuptools()
2830
except ImportError:
2931
pass
@@ -53,11 +55,25 @@
5355
'Topic :: Software Development :: Libraries :: Python Modules',
5456
]
5557

58+
5659
def read(*rnames):
    """Return the text of a file located relative to this file's directory.

    :param rnames: path components joined onto the directory of ``__file__``.
    :return: the complete content of the file as a string.
    """
    path = os.path.join(os.path.dirname(__file__), *rnames)
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(path) as handle:
        return handle.read()
5861

62+
5963
long_description = _descr
6064

65+
# Optional dependency groups (setuptools "extras").
extras_require = {
    'tests': [
        'memory-profiler>=0.57.0',
        'pytest-benchmark>=3.2.2',
    ],
}

# 'all' is the de-duplicated union of every other group. It is built in a
# separate list and attached afterwards: extending extras_require['all']
# while iterating over extras_require.values() would also feed 'all' into
# itself and, depending on dict ordering, list every requirement twice.
all_requirements = []
for group_name in sorted(extras_require):
    for requirement in extras_require[group_name]:
        if requirement not in all_requirements:
            all_requirements.append(requirement)
extras_require['all'] = all_requirements
76+
6177
setup(
6278
name='tika',
6379
version=version,
@@ -80,14 +96,13 @@ def read(*rnames):
8096
'tika-python = tika.tika:main'
8197
],
8298
},
83-
package_data = {
99+
package_data={
84100
# And include any *.conf files found in the 'conf' subdirectory
85101
# for the package
86102
},
87103
install_requires=[
88104
'setuptools',
89105
'requests'
90106
],
91-
extras_require={
92-
},
107+
extras_require=extras_require,
93108
)

tika/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
__version__ = "1.24"
17+
__version__ = "1.24.1"
1818

1919
try:
2020
__import__('pkg_resources').declare_namespace(__name__)

tika/tests/memory_benchmark.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# To run:
19+
# python tika/tests/memory_benchmark.py
20+
import os
21+
import zlib
22+
23+
24+
import tika.parser
25+
import tika.tika
26+
from memory_profiler import profile
27+
28+
from tika.tests.utils import gzip_compress
29+
30+
31+
@profile
def test_parser_binary():
    """Profile parsing the sample PDF handed to Tika as an open binary stream.

    The request carries the ``Accept-Encoding: gzip, deflate`` header.
    """
    here = os.path.dirname(__file__)
    pdf_path = os.path.join(here, 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}

    with open(pdf_path, 'rb') as stream:
        parsed = tika.parser.from_file(stream, headers=accept_gzip)
38+
39+
40+
@profile
def test_parser_buffer():
    """Profile parsing the sample PDF sent to Tika as an in-memory buffer.

    The request carries the ``Accept-Encoding: gzip, deflate`` header.
    """
    here = os.path.dirname(__file__)
    pdf_path = os.path.join(here, 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}

    with open(pdf_path, 'rb') as stream:
        parsed = tika.parser.from_buffer(stream.read(), headers=accept_gzip)
47+
48+
49+
@profile
def test_parser_zlib():
    """Profile parsing the sample PDF sent as a zlib-compressed buffer.

    The request carries the ``Accept-Encoding: gzip, deflate`` header.
    """
    here = os.path.dirname(__file__)
    pdf_path = os.path.join(here, 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}

    with open(pdf_path, 'rb') as stream:
        parsed = tika.parser.from_buffer(zlib.compress(stream.read()), headers=accept_gzip)
57+
58+
59+
@profile
def test_parser_gzip():
    """Profile parsing the sample PDF sent as a gzip-compressed buffer.

    The request carries the ``Accept-Encoding: gzip, deflate`` header.
    """
    here = os.path.dirname(__file__)
    pdf_path = os.path.join(here, 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}

    with open(pdf_path, 'rb') as stream:
        parsed = tika.parser.from_buffer(gzip_compress(stream.read()), headers=accept_gzip)
66+
67+
if __name__ == '__main__':
    # Run each profiled scenario once, in a fixed order.
    for scenario in (
        test_parser_buffer,
        test_parser_binary,
        test_parser_zlib,
        test_parser_gzip,
    ):
        scenario()

tika/tests/test_benchmark.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# To run:
19+
# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py
20+
import os
21+
import unittest
22+
import zlib
23+
24+
import tika.parser
25+
import tika.tika
26+
from tika.tests.utils import HTTPStatusOk, gzip_compress
27+
28+
29+
def test_local_binary(benchmark):
    """Benchmark parsing the sample PDF passed as an open binary stream."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    parsed = benchmark(tika_from_binary, pdf_path)

    assert parsed['status'] == HTTPStatusOk
35+
36+
37+
def test_parser_buffer(benchmark):
    """Benchmark parsing the sample PDF sent as an uncompressed buffer."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    parsed = benchmark(tika_from_buffer, pdf_path)

    assert parsed['status'] == HTTPStatusOk
43+
44+
45+
def test_parser_buffer_zlib_input(benchmark):
    """Benchmark parsing the sample PDF sent as a zlib-compressed buffer."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    parsed = benchmark(tika_from_buffer_zlib, pdf_path)

    assert parsed['status'] == HTTPStatusOk
52+
53+
54+
def test_parser_buffer_gzip_input(benchmark):
    """Benchmark parsing the sample PDF sent as a gzip-compressed buffer."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    parsed = benchmark(tika_from_buffer_gzip, pdf_path)

    assert parsed['status'] == HTTPStatusOk
60+
61+
62+
def test_local_binary_with_gzip_output(benchmark):
    """Benchmark the binary-stream parse with ``Accept-Encoding: gzip, deflate``."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}
    parsed = benchmark(tika_from_binary, pdf_path, headers=accept_gzip)

    assert parsed['status'] == HTTPStatusOk
68+
69+
70+
def test_parser_buffer_with_gzip_output(benchmark):
    """Benchmark the uncompressed-buffer parse with ``Accept-Encoding: gzip, deflate``."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}
    parsed = benchmark(tika_from_buffer, pdf_path, headers=accept_gzip)

    assert parsed['status'] == HTTPStatusOk
76+
77+
78+
def test_parser_buffer_zlib_input_and_gzip_output(benchmark):
    """Benchmark the zlib-compressed-buffer parse with ``Accept-Encoding: gzip, deflate``."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}
    parsed = benchmark(tika_from_buffer_zlib, pdf_path, headers=accept_gzip)

    assert parsed['status'] == HTTPStatusOk
85+
86+
87+
def test_parser_buffer_gzip_input_and_gzip_output(benchmark):
    """Benchmark the gzip-compressed-buffer parse with ``Accept-Encoding: gzip, deflate``."""
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    accept_gzip = {'Accept-Encoding': 'gzip, deflate'}
    parsed = benchmark(tika_from_buffer_gzip, pdf_path, headers=accept_gzip)

    assert parsed['status'] == HTTPStatusOk
93+
94+
95+
def tika_from_buffer_zlib(file, headers=None):
    """Read ``file`` in binary mode, zlib-compress it and parse it as a buffer.

    :param file: path of the file to read.
    :param headers: optional request headers forwarded to ``tika.parser.from_buffer``.
    :return: whatever ``tika.parser.from_buffer`` returns.
    """
    with open(file, 'rb') as stream:
        raw = stream.read()
        compressed = zlib.compress(raw)
        return tika.parser.from_buffer(compressed, headers=headers)
98+
99+
100+
def tika_from_buffer_gzip(file, headers=None):
    """Read ``file`` in binary mode, gzip-compress it and parse it as a buffer.

    :param file: path of the file to read.
    :param headers: optional request headers forwarded to ``tika.parser.from_buffer``.
    :return: whatever ``tika.parser.from_buffer`` returns.
    """
    with open(file, 'rb') as stream:
        raw = stream.read()
        compressed = gzip_compress(raw)
        return tika.parser.from_buffer(compressed, headers=headers)
103+
104+
105+
def tika_from_buffer(file, headers=None):
    """Read ``file`` in binary mode and parse its bytes as a buffer.

    :param file: path of the file to read.
    :param headers: optional request headers forwarded to ``tika.parser.from_buffer``.
    :return: whatever ``tika.parser.from_buffer`` returns.
    """
    with open(file, 'rb') as stream:
        raw = stream.read()
        return tika.parser.from_buffer(raw, headers=headers)
108+
109+
110+
def tika_from_binary(file, headers=None):
    """Open ``file`` in binary mode and hand the open stream to the parser.

    :param file: path of the file to open.
    :param headers: optional request headers forwarded to ``tika.parser.from_file``.
    :return: whatever ``tika.parser.from_file`` returns.
    """
    with open(file, 'rb') as stream:
        parsed = tika.parser.from_file(stream, headers=headers)
        return parsed
113+
114+
115+
if __name__ == '__main__':
    # NOTE(review): the tests visible in this module are plain functions that
    # take a ``benchmark`` argument rather than unittest.TestCase methods, so
    # unittest presumably finds nothing to run here -- confirm and consider
    # invoking pytest instead.
    unittest.main()

tika/tests/test_tika.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
# python -m unittest tests.tests
1919
import os
2020
import unittest
21+
2122
import tika.parser
2223
import tika.tika
24+
from tika.tests.utils import HTTPStatusOk
2325

2426

2527
class CreateTest(unittest.TestCase):
@@ -50,6 +52,10 @@ def test_local_binary(self):
5052
with open(file, 'rb') as file_obj:
5153
self.assertTrue(tika.parser.from_file(file_obj))
5254

55+
def test_local_buffer(self):
    """parse a plain-text buffer"""
    parsed = tika.parser.from_buffer('Good evening, Dave')
    status = parsed['status']
    self.assertEqual(status, HTTPStatusOk)
58+
5359
def test_local_path(self):
5460
"""parse file path"""
5561
file = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
@@ -62,5 +68,6 @@ def test_kill_server(self):
6268
tika.parser.from_file(file_obj)
6369
self.assertIsNone(tika.tika.killServer())
6470

71+
6572
if __name__ == '__main__':
6673
unittest.main()

tika/tests/utils.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
import gzip
18+
19+
20+
def _resolve_http_status_ok():
    """Return the standard library's constant for HTTP status "OK".

    The lookup is tried in this order: ``httplib.OK`` (Python 2.7),
    ``http.HTTPStatus.OK`` (Python > 3.4), ``http.client.OK`` (Python 3.4).
    """
    try:
        # python 2.7
        import httplib as legacy_client
    except ImportError:
        pass
    else:
        return legacy_client.OK

    try:
        # python > 3.4
        from http import HTTPStatus
    except ImportError:
        # python 3.4
        import http.client

        return http.client.OK
    return HTTPStatus.OK


# Resolved once at import time; tests compare response statuses against it.
HTTPStatusOk = _resolve_http_status_ok()
40+
41+
42+
def gzip_compress(file_obj):
    """Return ``file_obj`` compressed in gzip format.

    :param file_obj: the payload to compress. Despite its name this is the
        data itself (bytes), not a file object. Text input is encoded as
        UTF-8 before compression.
    :return: the gzip-compressed payload as bytes.
    """
    import io

    data = file_obj
    # Only text needs encoding. Calling ``.encode`` on a Python 2 byte string
    # first decodes it as ASCII and fails on binary content such as a PDF.
    if isinstance(data, type(u'')):
        data = data.encode('utf-8')

    # ``gzip.compress`` is missing on Python 2.7; detect it explicitly rather
    # than catching AttributeError around the call.
    compress = getattr(gzip, 'compress', None)
    if compress is not None:
        return compress(data)

    # Python 2.7 fallback: stream the payload through a GzipFile that writes
    # into an in-memory binary buffer.
    out = io.BytesIO()
    with gzip.GzipFile(fileobj=out, mode='wb') as gzip_stream:
        gzip_stream.write(data)

    # The gzip trailer is only written on close, so read the buffer afterwards.
    return out.getvalue()

0 commit comments

Comments
 (0)