Skip to content

Commit ce7d966

Browse files
committed
Fix for #388 'charmap' codec can't decode byte 0x81 in position 279: character maps to <undefined>
1 parent d189422 commit ce7d966

File tree

1 file changed

+27
-12
lines changed

1 file changed

+27
-12
lines changed

tika/unpack.py

Lines changed: 27 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -81,18 +81,33 @@ def _parse(tarOutput):
81 81

82 82
metadataMember = tarFile.getmember("__METADATA__")
83 83
if not metadataMember.issym() and metadataMember.isfile():
84-
with closing(_text_wrapper(tarFile.extractfile(metadataMember))) as metadataFile:
85-
metadataReader = csv.reader(_truncate_nulls(metadataFile))
86-
for metadataLine in metadataReader:
87-
# each metadata line comes as a key-value pair, with list values
88-
# returned as extra values in the line - convert single values
89-
# to non-list values to be consistent with parser metadata
90-
assert len(metadataLine) >= 2
91-
92-
if len(metadataLine) > 2:
93-
metadata[metadataLine[0]] = metadataLine[1:]
94-
else:
95-
metadata[metadataLine[0]] = metadataLine[1]
84+
if version_info.major >= 3:
85+
with closing(_text_wrapper(tarFile.extractfile(metadataMember), encoding=tarFile.encoding)) as metadataFile:
86+
metadataReader = csv.reader(_truncate_nulls(metadataFile))
87+
for metadataLine in metadataReader:
88+
# each metadata line comes as a key-value pair, with list values
89+
# returned as extra values in the line - convert single values
90+
# to non-list values to be consistent with parser metadata
91+
assert len(metadataLine) >= 2
92+
93+
if len(metadataLine) > 2:
94+
metadata[metadataLine[0]] = metadataLine[1:]
95+
else:
96+
metadata[metadataLine[0]] = metadataLine[1]
97+
else:
98+
with closing(_text_wrapper(tarFile.extractfile(metadataMember))) as metadataFile:
99+
metadataReader = csv.reader(_truncate_nulls(metadataFile))
100+
for metadataLine in metadataReader:
101+
# each metadata line comes as a key-value pair, with list values
102+
# returned as extra values in the line - convert single values
103+
# to non-list values to be consistent with parser metadata
104+
assert len(metadataLine) >= 2
105+
106+
if len(metadataLine) > 2:
107+
metadata[metadataLine[0]] = metadataLine[1:]
108+
else:
109+
metadata[metadataLine[0]] = metadataLine[1]
110+
96 111

97 112
# get the content
98 113
content = ""

0 commit comments

Comments
 (0)