Skip to content

Commit bcf2df1

Browse files
author
akdidier
committed
updated to v1.19 and added custom config
1 parent f402e65 commit bcf2df1

File tree

3 files changed

+38
-23
lines changed

3 files changed

+38
-23
lines changed

tika/detector.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,21 @@
1818

1919
from .tika import detectType1, callServer, ServerEndpoint
2020

21-
def from_file(filename):
21+
def from_file(filename, config_path=None):
2222
'''
2323
Detects MIME type of specified file
2424
:param filename: file whose type needs to be detected
2525
:return: MIME type
2626
'''
27-
jsonOutput = detectType1('type', filename)
27+
jsonOutput = detectType1('type', filename, config_path=config_path)
2828
return jsonOutput[1]
2929

30-
def from_buffer(string):
30+
def from_buffer(string, config_path=None):
3131
'''
3232
Detects MIME type of the buffered content
3333
:param string: buffered content whose type needs to be detected
3434
:return: MIME type
3535
'''
3636
status, response = callServer('put', ServerEndpoint, '/detect/stream', string,
37-
{'Accept': 'text/plain'}, False)
37+
{'Accept': 'text/plain'}, False, config_path=config_path)
3838
return response

tika/parser.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import os
2121
import json
2222

23-
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None):
23+
def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
2424
'''
2525
Parses a file for metadata and content
2626
:param filename: path to file which needs to be parsed
@@ -33,13 +33,14 @@ def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers
3333
'content' has a str value and metadata has a dict type value.
3434
'''
3535
if not xmlContent:
36-
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers)
36+
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
3737
else:
38-
jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'}, headers=headers)
38+
jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
39+
headers=headers, config_path=config_path)
3940
return _parse(jsonOutput)
4041

4142

42-
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None):
43+
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
4344
'''
4445
Parses the content from buffer
4546
:param string: Buffer value
@@ -54,9 +55,9 @@ def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers
5455
headers.update({'Accept': 'application/json'})
5556

5657
if not xmlContent:
57-
status, response = callServer('put', serverEndpoint, '/rmeta/text', string, headers, False)
58+
status, response = callServer('put', serverEndpoint, '/rmeta/text', string, headers, False, config_path=config_path)
5859
else:
59-
status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, headers, False)
60+
status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, headers, False, config_path=config_path)
6061

6162
return _parse((status,response))
6263

tika/tika.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@
4747
# Use auto Language detection feature
4848
print(translate.from_file('/path/to/file', 'destLang'))
4949
50+
***Tika-Python Configuration***
51+
You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
52+
for details on writing configuration files. The configuration is applied only once, when the server is first started.
53+
To use a configuration file with a parser or detector:
54+
parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
55+
or:
56+
detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
57+
or:
58+
detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
5059
5160
'''
5261

@@ -152,7 +161,7 @@ def make_content_disposition_header(fn):
152161
log.setLevel(logging.INFO)
153162

154163
Windows = True if platform.system() == "Windows" else False
155-
TikaVersion = os.getenv('TIKA_VERSION', '1.18')
164+
TikaVersion = os.getenv('TIKA_VERSION', '1.19')
156165
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
157166
TikaFilesPath = tempfile.gettempdir()
158167
TikaServerLogFilePath = log_path
@@ -292,7 +301,7 @@ def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
292301

293302
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
294303
responseMimeType='application/json',
295-
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None):
304+
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None):
296305
'''
297306
Parse the object and return extracted metadata and/or text in JSON format.
298307
:param option:
@@ -316,7 +325,7 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
316325
service = services.get(option, services['all'])
317326
if service == '/tika': responseMimeType = 'text/plain'
318327
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
319-
headers, verbose, tikaServerJar, rawResponse=rawResponse)
328+
headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
320329

321330
if file_type == 'remote': os.unlink(path)
322331
return (status, response)
@@ -437,7 +446,7 @@ def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos
437446

438447
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
439448
responseMimeType='text/plain',
440-
services={'type': '/detect/stream'}):
449+
services={'type': '/detect/stream'}, config_path=None):
441450
'''
442451
Detect the MIME/media type of the stream and return it in text/plain.
443452
:param option:
@@ -459,7 +468,7 @@ def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos
459468
'Accept': responseMimeType,
460469
'Content-Disposition': make_content_disposition_header(path)
461470
},
462-
verbose, tikaServerJar)
471+
verbose, tikaServerJar, config_path=config_path)
463472
if csvOutput == 1:
464473
return(status, urlOrPath.decode("UTF-8") + "," + response)
465474
else:
@@ -486,7 +495,7 @@ def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServer
486495

487496
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
488497
httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
489-
rawResponse=False):
498+
rawResponse=False,config_path=None):
490499
'''
491500
Call the Tika Server, do some error checking, and return the response.
492501
:param verb:
@@ -510,14 +519,14 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti
510519

511520
global TikaClientOnly
512521
if not TikaClientOnly:
513-
serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath)
522+
serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
514523

515524
serviceUrl = serverEndpoint + service
516525
if verb not in httpVerbs:
517526
log.exception('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
518527
raise TikaException('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
519528
verbFn = httpVerbs[verb]
520-
529+
521530
if Windows and hasattr(data, "read"):
522531
data = data.read()
523532

@@ -539,7 +548,7 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti
539548
return (resp.status_code, resp.text)
540549

541550

542-
def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None):
551+
def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
543552
'''
544553
Check that tika-server is running. If not, download JAR file and start it up.
545554
:param scheme: e.g. http or https
@@ -565,7 +574,7 @@ def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJ
565574
os.remove(jarPath)
566575
tikaServerJar = getRemoteJar(tikaServerJar, jarPath)
567576

568-
status = startServer(jarPath, TikaJava, serverHost, port, classpath)
577+
status = startServer(jarPath, TikaJava, serverHost, port, classpath, config_path)
569578
if not status:
570579
log.error("Failed to receive startup confirmation from startServer.")
571580
raise RuntimeError("Unable to start Tika server.")
@@ -589,7 +598,7 @@ def checkJarSig(tikaServerJar, jarPath):
589598
return existingContents == m.hexdigest()
590599

591600

592-
def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, port = Port, classpath=None):
601+
def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
593602
'''
594603
Starts Tika Server
595604
:param tikaServerJar: path to tika server jar
@@ -611,8 +620,13 @@ def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, po
611620
classpath = tikaServerJar
612621

613622
# setup command string
614-
cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %i --host %s &' \
615-
% (java_path, classpath, port, host)
623+
cmd_string = ""
624+
if not config_path:
625+
cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %i --host %s &' \
626+
% (java_path, classpath, port, host)
627+
else:
628+
cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %i --host %s --config %s &' \
629+
% (java_path, classpath, port, host, config_path)
616630

617631
# Check that we can write to log path
618632
try:

0 commit comments

Comments
 (0)