Skip to content

Commit dd77186

Browse files
Merge pull request #259 from jjelosua/master
expose service in the from_file parser
2 parents 27b24ed + f3625ba commit dd77186

File tree

2 files changed

+87
-14
lines changed

2 files changed

+87
-14
lines changed

tika/parser.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import os
2121
import json
2222

23-
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
23+
def from_file(filename, service='all', serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
2424
'''
2525
Parses a file for metadata and content
2626
:param filename: path to file which needs to be parsed
@@ -33,11 +33,11 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers
3333
'content' has a str value and metadata has a dict type value.
3434
'''
3535
if not xmlContent:
36-
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
36+
output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
3737
else:
38-
jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
38+
output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
3939
headers=headers, config_path=config_path, requestOptions=requestOptions)
40-
return _parse(jsonOutput)
40+
return _parse(output, service)
4141

4242

4343
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
@@ -61,20 +61,35 @@ def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers
6161

6262
return _parse((status,response))
6363

64-
def _parse(jsonOutput):
64+
def _parse(output, service='all'):
6565
'''
66-
Parses JSON response from Tika REST API server
67-
:param jsonOutput: JSON output from Tika Server
66+
Parses response from Tika REST API server
67+
:param output: output from Tika Server
68+
:param service: service requested from the tika server
69+
Default is 'all', which results in recursive text content+metadata.
70+
'meta' returns only metadata
71+
'text' returns only content
6872
:return: a dictionary having 'metadata' and 'content' values
6973
'''
70-
parsed={}
71-
if not jsonOutput:
74+
parsed={'metadata': None, 'content': None}
75+
if not output:
7276
return parsed
73-
74-
parsed["status"] = jsonOutput[0]
75-
if jsonOutput[1] == None or jsonOutput[1] == "":
77+
78+
parsed["status"] = output[0]
79+
if output[1] == None or output[1] == "":
80+
return parsed
81+
82+
if service == "text":
83+
parsed["content"] = output[1]
84+
return parsed
85+
86+
realJson = json.loads(output[1])
87+
88+
parsed["metadata"] = {}
89+
if service == "meta":
90+
for key in realJson:
91+
parsed["metadata"][key] = realJson[key]
7692
return parsed
77-
realJson = json.loads(jsonOutput[1])
7893

7994
content = ""
8095
for js in realJson:
@@ -85,7 +100,6 @@ def _parse(jsonOutput):
85100
content = None
86101

87102
parsed["content"] = content
88-
parsed["metadata"] = {}
89103

90104
for js in realJson:
91105
for n in js:
tika/tests/test_from_file_service.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# python -m unittest tika.tests.test_from_file_service
19+
20+
import unittest
21+
import tika.parser
22+
23+
24+
class CreateTest(unittest.TestCase):
25+
'test different services in from_file parsing: Content, Metadata or both in recursive mode'
26+
27+
def test_default_service(self):
28+
'parse file using default service'
29+
result = tika.parser.from_file(
30+
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf')
31+
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
32+
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
33+
def test_default_service_explicit(self):
34+
'parse file using default service explicitly'
35+
result = tika.parser.from_file(
36+
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='all')
37+
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
38+
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
39+
def test_text_service(self):
40+
'parse file using the content only service'
41+
result = tika.parser.from_file(
42+
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='text')
43+
self.assertIsNone(result['metadata'])
44+
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
45+
def test_meta_service(self):
46+
'parse file using the metadata only service'
47+
result = tika.parser.from_file(
48+
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='meta')
49+
self.assertIsNone(result['content'])
50+
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
51+
def test_invalid_service(self):
52+
'parse file using an invalid service should perform the default parsing'
53+
result = tika.parser.from_file(
54+
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='bad')
55+
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
56+
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
57+
58+
if __name__ == '__main__':
59+
unittest.main()

0 commit comments

Comments
 (0)