Skip to content

Commit 11a1f07

Browse files
committed
Fix the utf-8 issue on the unpack interface
1 parent cb8cd47 commit 11a1f07

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

tika/tests/tests_unpack.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import unittest
2+
from tempfile import NamedTemporaryFile
3+
from tika import unpack
4+
5+
6+
class CreateTest(unittest.TestCase):
7+
"Test different encodings"
8+
text_utf8 = "Hello, world!! 😎 👽"
9+
text_ascii = "Hello, world!!"
10+
11+
def test_utf8(self):
12+
with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp', encoding="utf8") as f:
13+
f.write(self.text_utf8)
14+
f.flush()
15+
f.seek(0)
16+
parsed = unpack.from_file(f.name)
17+
self.assertEqual(parsed["content"].strip(), self.text_utf8)
18+
19+
def test_ascii(self):
20+
with NamedTemporaryFile("w+t", prefix='tika-python', suffix='.txt', dir='/tmp', encoding="utf8") as f:
21+
f.write(self.text_ascii)
22+
f.flush()
23+
f.seek(0)
24+
parsed = unpack.from_file(f.name)
25+
self.assertEqual(parsed["content"].strip(), self.text_ascii)
26+
27+
28+
if __name__ == '__main__':
29+
unittest.main()

tika/unpack.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def _parse(tarOutput):
96 96

97 97
contentMember = tarFile.getmember("__TEXT__")
98 98
if not contentMember.issym() and contentMember.isfile():
99-
content = _text_wrapper(tarFile.extractfile(contentMember)).read()
99+
content = _text_wrapper(tarFile.extractfile(contentMember), encoding='utf8').read()
100 100

101 101
# get the remaining files as attachments
102 102
attachments = {}

0 commit comments

Comments
 (0)