Skip to content

Commit 32504f1

Browse files
Merge pull request #171 from yarongon/hotfix/issue-169-uft8-problem
Fix the utf-8 issue on the unpack interface
2 parents cb8cd47 + 24872a9 commit 32504f1

File tree

2 files changed

+39
-4
lines changed

2 files changed

+39
-4
lines changed

tika/tests/tests_unpack.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# coding=utf8
2+
3+
import unittest
4+
from tempfile import NamedTemporaryFile
5+
from tika import unpack
6+
7+
8+
class CreateTest(unittest.TestCase):
    """Check the text content returned by ``unpack.from_file`` for different encodings."""

    # Sample text that includes non-ASCII (emoji) characters.
    text_utf8 = u"Hello, world!! 😎 👽"
    # Sample text restricted to ASCII characters.
    text_ascii = u"Hello, world!!"

    def test_utf8(self):
        """Text written as UTF-8 bytes must come back equal to the original."""
        # No ``dir`` argument: the platform's default temporary directory is
        # used instead of a hard-coded '/tmp', which does not exist everywhere.
        with NamedTemporaryFile("w+b", prefix='tika-python', suffix='.txt') as f:
            f.write(self.text_utf8.encode("utf8"))
            # Push buffered bytes out before handing the file name to unpack.
            f.flush()
            parsed = unpack.from_file(f.name)
            self.assertEqual(parsed["content"].strip(), self.text_utf8)

    def test_ascii(self):
        """Plain ASCII text written in text mode must come back unchanged."""
        with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt') as f:
            f.write(self.text_ascii)
            # Push buffered text out before handing the file name to unpack.
            f.flush()
            parsed = unpack.from_file(f.name)
            self.assertEqual(parsed["content"].strip(), self.text_ascii)
28+
29+
30+
# Run the test cases only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    unittest.main()

tika/unpack.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
# tarfile returned object can be used as is in earlier versions.
2929
_text_wrapper = TextIOWrapper if version_info.major >= 3 else lambda x: x
3030

31+
3132
def from_file(filename, serverEndpoint=ServerEndpoint):
3233
'''
3334
Parse from file
@@ -36,7 +37,7 @@ def from_file(filename, serverEndpoint=ServerEndpoint):
3637
:return:
3738
'''
3839
tarOutput = parse1('unpack', filename, serverEndpoint,
39-
responseMimeType='application/x-tar',
40+
responseMimeType='application/x-tar',
4041
services={'meta': '/meta', 'text': '/tika',
4142
'all': '/rmeta/xml', 'unpack': '/unpack/all'},
4243
rawResponse=True)
@@ -52,8 +53,8 @@ def from_buffer(string, serverEndpoint=ServerEndpoint):
5253
'''
5354
status, response = callServer('put', serverEndpoint, '/unpack/all', string,
5455
{'Accept': 'application/x-tar'}, False,
55-
rawResponse=True)
56-
56+
rawResponse=True)
57+
5758
return _parse((status, response))
5859

5960

@@ -96,7 +97,10 @@ def _parse(tarOutput):
9697

9798
contentMember = tarFile.getmember("__TEXT__")
9899
if not contentMember.issym() and contentMember.isfile():
99-
content = _text_wrapper(tarFile.extractfile(contentMember)).read()
100+
if version_info.major >= 3:
101+
content = _text_wrapper(tarFile.extractfile(contentMember), encoding='utf8').read()
102+
else:
103+
content = tarFile.extractfile(contentMember).read().decode('utf8')
100104

101105
# get the remaining files as attachments
102106
attachments = {}

0 commit comments

Comments
 (0)