Skip to content

Commit 2cfe0de

Browse files
Merge pull request #269 from carantunes/parse-from-file-binary
Allow parsing from binary streams
2 parents 45e41b0 + c2a2bac commit 2cfe0de

File tree

4 files changed

+41
-14
lines changed

4 files changed

+41
-14
lines changed

tika/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
def from_file(filename, service='all', serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
2424
'''
2525
Parses a file for metadata and content
26-
:param filename: path to file which needs to be parsed
26+
:param filename: path to the file to be parsed, or a binary file object opened with open(path, 'rb')
2727
:param serverEndpoint: Server endpoint url
2828
:param xmlContent: Whether or not XML content be requested.
2929
Default is 'False', which results in text content.

tika/tests/files/rwservlet.pdf

34.4 KB
Binary file not shown.

tika/tests/test_tika.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,32 +15,45 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717
#
18-
#python -m unittest tests.tests
19-
18+
# python -m unittest tests.tests
19+
import os
2020
import unittest
2121
import tika.parser
2222

2323

2424
class CreateTest(unittest.TestCase):
25-
"test for file types"
25+
"""test for file types"""
2626

2727
def test_remote_pdf(self):
28-
'parse remote PDF'
28+
"""parse remote PDF"""
2929
self.assertTrue(tika.parser.from_file(
3030
'http://appsrv.achd.net/reports/rwservlet?food_rep_insp&P_ENCOUNTER=201504160015'))
31+
3132
def test_remote_html(self):
32-
'parse remote HTML'
33-
self.assertTrue(tika.parser.from_file(
34-
'http://neverssl.com/index.html'))
33+
"""parse remote HTML"""
34+
self.assertTrue(tika.parser.from_file('http://neverssl.com/index.html'))
35+
3536
def test_remote_mp3(self):
36-
'parese remote mp3'
37+
"""parse remote mp3"""
3738
self.assertTrue(tika.parser.from_file(
3839
'https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3'))
40+
3941
def test_remote_jpg(self):
40-
'parse remote jpg'
42+
"""parse remote jpg"""
4143
self.assertTrue(tika.parser.from_file(
4244
'https://www.nasa.gov/sites/default/files/thumbnails/image/j2m-shareable.jpg'))
4345

46+
def test_local_binary(self):
    """parse a local file passed as an open binary file object"""
    # Named `pdf_path` rather than `file` so the Python 2 builtin `file`
    # type is not shadowed inside the test.
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    with open(pdf_path, 'rb') as file_obj:
        self.assertTrue(tika.parser.from_file(file_obj))
51+
52+
def test_local_path(self):
    """parse a local file passed as a filesystem path"""
    # Named `pdf_path` rather than `file` so the Python 2 builtin `file`
    # type is not shadowed inside the test.
    pdf_path = os.path.join(os.path.dirname(__file__), 'files', 'rwservlet.pdf')
    self.assertTrue(tika.parser.from_file(pdf_path))
56+
4457

4558
if __name__ == '__main__':
4659
unittest.main()

tika/tika.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
5959
6060
'''
61+
import types
6162

6263
USAGE = """
6364
tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
@@ -140,6 +141,7 @@ def make_content_disposition_header(fn):
140141
from subprocess import STDOUT
141142
from os import walk
142143
import logging
144+
import io
143145

144146
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
145147
log_file = os.path.join(log_path, 'tika.log')
@@ -325,9 +327,9 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
325327
service = services.get(option, services['all'])
326328
if service == '/tika': responseMimeType = 'text/plain'
327329
headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
328-
with open(path, 'rb') as f:
330+
with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
329331
status, response = callServer('put', serverEndpoint, service, f,
330-
headers, verbose, tikaServerJar, config_path=config_path,
332+
headers, verbose, tikaServerJar, config_path=config_path,
331333
rawResponse=rawResponse, requestOptions=requestOptions)
332334

333335
if file_type == 'remote': os.unlink(path)
@@ -690,14 +692,26 @@ def toFilename(url):
690692
value = re.sub(r'[^\w\s\.\-]', '-', path).strip().lower()
691693
return re.sub(r'[-\s]+', '-', value).strip("-")[-200:]
692694

693-
695+
696+
def _is_file_object(f):
697+
try:
698+
file_types = (types.FileType, io.IOBase)
699+
except AttributeError:
700+
file_types = (io.IOBase,)
701+
702+
return isinstance(f, file_types)
703+
694704
def getRemoteFile(urlOrPath, destPath):
695705
'''
696706
Fetches URL to local path or just returns absolute path.
697707
:param urlOrPath: resource locator, generally URL or path
698708
:param destPath: path to store the resource, usually a path on file system
699-
:return: tuple having (path, 'local'/'remote')
709+
:return: tuple having (path, 'local'/'remote'/'binary')
700710
'''
711+
# handle binary stream input
712+
if _is_file_object(urlOrPath):
713+
return (urlOrPath.name, 'binary')
714+
701715
urlp = urlparse(urlOrPath)
702716
if urlp.scheme == '':
703717
return (os.path.abspath(urlOrPath), 'local')

0 commit comments

Comments
 (0)