2020import os
2121import json
2222
23- def from_file (filename , serverEndpoint = ServerEndpoint , xmlContent = False , headers = None , config_path = None , requestOptions = {}):
23+ def from_file (filename , service = 'all' , serverEndpoint = ServerEndpoint , xmlContent = False , headers = None , config_path = None , requestOptions = {}):
2424 '''
2525 Parses a file for metadata and content
2626 :param filename: path to file which needs to be parsed
@@ -33,11 +33,11 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers
3333 'content' has a str value and metadata has a dict type value.
3434 '''
3535 if not xmlContent :
36- jsonOutput = parse1 ('all' , filename , serverEndpoint , headers = headers , config_path = config_path , requestOptions = requestOptions )
36+ output = parse1 (service , filename , serverEndpoint , headers = headers , config_path = config_path , requestOptions = requestOptions )
3737 else :
38- jsonOutput = parse1 ('all' , filename , serverEndpoint , services = {'meta' : '/meta' , 'text' : '/tika' , 'all' : '/rmeta/xml' },
38+ output = parse1 (service , filename , serverEndpoint , services = {'meta' : '/meta' , 'text' : '/tika' , 'all' : '/rmeta/xml' },
3939 headers = headers , config_path = config_path , requestOptions = requestOptions )
40- return _parse (jsonOutput )
40+ return _parse (output , service )
4141
4242
4343def from_buffer (string , serverEndpoint = ServerEndpoint , xmlContent = False , headers = None , config_path = None , requestOptions = {}):
@@ -61,20 +61,35 @@ def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers
6161
6262 return _parse ((status ,response ))
6363
64- def _parse (jsonOutput ):
64+ def _parse (output , service = 'all' ):
6565 '''
66- Parses JSON response from Tika REST API server
67- :param jsonOutput: JSON output from Tika Server
66+ Parses response from Tika REST API server
67+ :param output: output from Tika Server
68+ :param service: service requested from the tika server
69+ Default is 'all', which results in recursive text content+metadata.
70+ 'meta' returns only metadata
71+ 'text' returns only content
6872 :return: a dictionary having 'metadata' and 'content' values
6973 '''
70- parsed = {}
71- if not jsonOutput :
74+ parsed = {'metadata' : None , 'content' : None }
75+ if not output :
7276 return parsed
73-
74- parsed ["status" ] = jsonOutput [0 ]
75- if jsonOutput [1 ] == None or jsonOutput [1 ] == "" :
77+
78+ parsed ["status" ] = output [0 ]
79+ if output [1 ] == None or output [1 ] == "" :
80+ return parsed
81+
82+ if service == "text" :
83+ parsed ["content" ] = output [1 ]
84+ return parsed
85+
86+ realJson = json .loads (output [1 ])
87+
88+ parsed ["metadata" ] = {}
89+ if service == "meta" :
90+ for key in realJson :
91+ parsed ["metadata" ][key ] = realJson [key ]
7692 return parsed
77- realJson = json .loads (jsonOutput [1 ])
7893
7994 content = ""
8095 for js in realJson :
@@ -85,7 +100,6 @@ def _parse(jsonOutput):
85100 content = None
86101
87102 parsed ["content" ] = content
88- parsed ["metadata" ] = {}
89103
90104 for js in realJson :
91105 for n in js :
0 commit comments