Skip to content

Commit fd44905

Browse files
author
pmc_sreiff
committed
add requestOptions
1 parent 3208269 commit fd44905

File tree

7 files changed

+55
-39
lines changed

7 files changed

+55
-39
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,15 @@ parsed = parser.from_file(home + '/git/geotopicparser-utils/geotopics/polar.geot
210210
print parsed["metadata"]
211211
```
212212

213+
Customizing the Tika Server Request
214+
---------------------------
215+
You may customize the outgoing HTTP request to the Tika server by setting `requestOptions` on the `.from_file` and `.from_buffer` methods (Unpack, Detect, Config, Language, Translate). It should be a dictionary of arguments that will be passed to the request method. The [request method documentation](https://2.python-requests.org/en/master/api/#requests.request) specifies valid arguments. This will override any defaults except for `url` and `params`/`data`.
216+
217+
```
218+
from tika import parser
219+
parsed = parser.from_file('/path/to/file', requestOptions={'timeout': 120})
220+
```
221+
213222
New Command Line Client Tool
214223
============================
215224
When you install Tika-Python you also get a new command

tika/detector.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,21 @@
1818

1919
from .tika import detectType1, callServer, ServerEndpoint
2020

21-
def from_file(filename, config_path=None):
21+
def from_file(filename, config_path=None, requestOptions={}):
2222
'''
2323
Detects MIME type of specified file
2424
:param filename: file whose type needs to be detected
2525
:return: MIME type
2626
'''
27-
jsonOutput = detectType1('type', filename, config_path=config_path)
27+
jsonOutput = detectType1('type', filename, config_path=config_path, requestOptions=requestOptions)
2828
return jsonOutput[1]
2929

30-
def from_buffer(string, config_path=None):
30+
def from_buffer(string, config_path=None, requestOptions={}):
3131
'''
3232
Detects MIME type of the buffered content
3333
:param string: buffered content whose type needs to be detected
3434
:return:
3535
'''
3636
status, response = callServer('put', ServerEndpoint, '/detect/stream', string,
37-
{'Accept': 'text/plain'}, False, config_path=config_path)
37+
{'Accept': 'text/plain'}, False, config_path=config_path, requestOptions=requestOptions)
3838
return response

tika/language.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,21 @@
1818

1919
from .tika import detectLang1, callServer, ServerEndpoint
2020

21-
def from_file(filename):
21+
def from_file(filename, requestOptions={}):
2222
'''
2323
Detects language of the file
2424
:param filename: path to file whose language needs to be detected
2525
:return:
2626
'''
27-
jsonOutput = detectLang1('file', filename)
27+
jsonOutput = detectLang1('file', filename, requestOptions=requestOptions)
2828
return jsonOutput[1]
2929

30-
def from_buffer(string):
30+
def from_buffer(string, requestOptions={}):
3131
'''
3232
Detects language of content in the buffer
3333
:param string: buffered data
3434
:return:
3535
'''
3636
status, response = callServer('put', ServerEndpoint, '/language/string', string,
37-
{'Accept': 'text/plain'}, False)
37+
{'Accept': 'text/plain'}, False, requestOptions=requestOptions)
3838
return response

tika/parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import os
2121
import json
2222

23-
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
23+
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
2424
'''
2525
Parses a file for metadata and content
2626
:param filename: path to file which needs to be parsed
@@ -33,14 +33,14 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers
3333
'content' has a str value and metadata has a dict type value.
3434
'''
3535
if not xmlContent:
36-
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
36+
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
3737
else:
3838
jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
39-
headers=headers, config_path=config_path)
39+
headers=headers, config_path=config_path, requestOptions=requestOptions)
4040
return _parse(jsonOutput)
4141

4242

43-
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
43+
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None, requestOptions={}):
4444
'''
4545
Parses the content from buffer
4646
:param string: Buffer value
@@ -55,9 +55,9 @@ def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers
5555
headers.update({'Accept': 'application/json'})
5656

5757
if not xmlContent:
58-
status, response = callServer('put', serverEndpoint, '/rmeta/text', string, headers, False, config_path=config_path)
58+
status, response = callServer('put', serverEndpoint, '/rmeta/text', string, headers, False, config_path=config_path, requestOptions=requestOptions)
5959
else:
60-
status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, headers, False, config_path=config_path)
60+
status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, headers, False, config_path=config_path, requestOptions=requestOptions)
6161

6262
return _parse((status,response))
6363

tika/tika.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
301301

302302
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
303303
responseMimeType='application/json',
304-
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None):
304+
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None, requestOptions={}):
305305
'''
306306
Parse the object and return extracted metadata and/or text in JSON format.
307307
:param option:
@@ -326,7 +326,7 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
326326
if service == '/tika': responseMimeType = 'text/plain'
327327
headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path)})
328328
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
329-
headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
329+
headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse, requestOptions=requestOptions)
330330

331331
if file_type == 'remote': os.unlink(path)
332332
return (status, response)
@@ -351,7 +351,7 @@ def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos
351351

352352
def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
353353
responseMimeType='text/plain',
354-
services={'file' : '/language/stream'}):
354+
services={'file' : '/language/stream'}, requestOptions={}):
355355
'''
356356
Detect the language of the provided stream and return its 2 character code as text/plain.
357357
:param option:
@@ -369,7 +369,7 @@ def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos
369369
raise TikaException('Language option must be one of %s ' % binary_string(services.keys()))
370370
service = services[option]
371371
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
372-
{'Accept': responseMimeType}, verbose, tikaServerJar)
372+
{'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
373373
return (status, response)
374374

375375
def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
@@ -392,7 +392,7 @@ def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbo
392392

393393
def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
394394
responseMimeType='text/plain',
395-
services={'all': '/translate/all'}):
395+
services={'all': '/translate/all'}, requestOptions={}):
396396
'''
397397
398398
:param option:
@@ -424,7 +424,7 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo
424424
service = services["all"] + "/" + Translator + "/" + destLang
425425
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
426426
{'Accept' : responseMimeType},
427-
verbose, tikaServerJar)
427+
verbose, tikaServerJar, requestOptions=requestOptions)
428428
return (status, response)
429429

430430
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
@@ -447,7 +447,7 @@ def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos
447447

448448
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
449449
responseMimeType='text/plain',
450-
services={'type': '/detect/stream'}, config_path=None):
450+
services={'type': '/detect/stream'}, config_path=None, requestOptions={}):
451451
'''
452452
Detect the MIME/media type of the stream and return it in text/plain.
453453
:param option:
@@ -469,14 +469,14 @@ def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos
469469
'Accept': responseMimeType,
470470
'Content-Disposition': make_content_disposition_header(path)
471471
},
472-
verbose, tikaServerJar, config_path=config_path)
472+
verbose, tikaServerJar, config_path=config_path, requestOptions=requestOptions)
473473
if csvOutput == 1:
474474
return(status, urlOrPath.decode("UTF-8") + "," + response)
475475
else:
476476
return (status, response)
477477

478478
def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
479-
services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}):
479+
services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}, requestOptions={}):
480480
'''
481481
Get the configuration of the Tika Server (parsers, detectors, etc.) and return it in JSON format.
482482
:param option:
@@ -490,13 +490,12 @@ def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServer
490490
if option not in services:
491491
die('config option must be one of mime-types, detectors, or parsers')
492492
service = services[option]
493-
status, response = callServer('get', serverEndpoint, service, None, {'Accept': responseMimeType}, verbose, tikaServerJar)
493+
status, response = callServer('get', serverEndpoint, service, None, {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
494494
return (status, response)
495495

496-
497496
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
498497
httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
499-
rawResponse=False,config_path=None):
498+
rawResponse=False,config_path=None, requestOptions={}):
500499
'''
501500
Call the Tika Server, do some error checking, and return the response.
502501
:param verb:
@@ -535,7 +534,15 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti
535534
if type(data) is unicode_string:
536535
encodedData = data.encode('utf-8')
537536

538-
resp = verbFn(serviceUrl, encodedData, headers=headers, verify=False)
537+
requestOptionsDefault = {
538+
'timeout': 60,
539+
'headers': headers,
540+
'verify': False
541+
}
542+
effectiveRequestOptions = requestOptionsDefault.copy()
543+
effectiveRequestOptions.update(requestOptions)
544+
545+
resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
539546
if verbose:
540547
print(sys.stderr, "Request headers: ", headers)
541548
print(sys.stderr, "Response headers: ", resp.headers)

tika/translate.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
from .tika import doTranslate1, callServer, Translator, ServerEndpoint
2020

21-
def from_file(filename, srcLang, destLang, serverEndpoint=ServerEndpoint):
21+
def from_file(filename, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}):
2222
'''
2323
Translates the content of source file to destination language
2424
:param filename: file whose contents needs translation
@@ -27,10 +27,10 @@ def from_file(filename, srcLang, destLang, serverEndpoint=ServerEndpoint):
2727
:param serverEndpoint: Tika server end point (Optional)
2828
:return: translated content
2929
'''
30-
jsonOutput = doTranslate1(srcLang+':'+destLang, filename, serverEndpoint)
30+
jsonOutput = doTranslate1(srcLang+':'+destLang, filename, serverEndpoint, requestOptions=requestOptions)
3131
return jsonOutput[1]
3232

33-
def from_buffer(string, srcLang, destLang, serverEndpoint=ServerEndpoint):
33+
def from_buffer(string, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}):
3434
'''
3535
Translates content from source language to desired destination language
3636
:param string: input content which needs translation
@@ -40,21 +40,21 @@ def from_buffer(string, srcLang, destLang, serverEndpoint=ServerEndpoint):
4040
:return:
4141
'''
4242
status, response = callServer('put', ServerEndpoint, '/translate/all/'+Translator+'/'+srcLang+'/'+destLang,
43-
string, {'Accept': 'text/plain'}, False)
43+
string, {'Accept': 'text/plain'}, False, requestOptions=requestOptions)
4444
return response
4545

46-
def auto_from_file(filename, destLang, serverEndpoint=ServerEndpoint):
46+
def auto_from_file(filename, destLang, serverEndpoint=ServerEndpoint, requestOptions={}):
4747
'''
4848
Translates contents of a file to desired language by auto detecting the source language
4949
:param filename: file whose contents needs translation
5050
:param destLang: name of the desired language for translation
5151
:param serverEndpoint: Tika server end point (Optional)
5252
:return:
5353
'''
54-
jsonOutput = doTranslate1(destLang, filename, serverEndpoint)
54+
jsonOutput = doTranslate1(destLang, filename, serverEndpoint, requestOptions=requestOptions)
5555
return jsonOutput[1]
5656

57-
def auto_from_buffer(string, destLang, serverEndpoint=ServerEndpoint):
57+
def auto_from_buffer(string, destLang, serverEndpoint=ServerEndpoint, requestOptions={}):
5858
'''
5959
Translates content to desired language by auto detecting the source language
6060
:param string: input content which needs translation
@@ -63,6 +63,6 @@ def auto_from_buffer(string, destLang, serverEndpoint=ServerEndpoint):
6363
:return:
6464
'''
6565
status, response = callServer('put', ServerEndpoint, '/translate/all/'+Translator+'/'+destLang,
66-
string, {'Accept': 'text/plain'}, False)
66+
string, {'Accept': 'text/plain'}, False, requestOptions=requestOptions)
6767
return response
6868

tika/unpack.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
_text_wrapper = TextIOWrapper if version_info.major >= 3 else lambda x: x
3030

3131

32-
def from_file(filename, serverEndpoint=ServerEndpoint):
32+
def from_file(filename, serverEndpoint=ServerEndpoint, requestOptions={}):
3333
'''
3434
Parse from file
3535
:param filename: file
@@ -40,11 +40,11 @@ def from_file(filename, serverEndpoint=ServerEndpoint):
4040
responseMimeType='application/x-tar',
4141
services={'meta': '/meta', 'text': '/tika',
4242
'all': '/rmeta/xml', 'unpack': '/unpack/all'},
43-
rawResponse=True)
43+
rawResponse=True, requestOptions=requestOptions)
4444
return _parse(tarOutput)
4545

4646

47-
def from_buffer(string, serverEndpoint=ServerEndpoint):
47+
def from_buffer(string, serverEndpoint=ServerEndpoint, requestOptions={}):
4848
'''
4949
Parse from buffered content
5050
:param string: buffered content
@@ -53,7 +53,7 @@ def from_buffer(string, serverEndpoint=ServerEndpoint):
5353
'''
5454
status, response = callServer('put', serverEndpoint, '/unpack/all', string,
5555
{'Accept': 'application/x-tar'}, False,
56-
rawResponse=True)
56+
rawResponse=True, requestOptions=requestOptions)
5757

5858
return _parse((status, response))
5959

0 commit comments

Comments
 (0)