Skip to content

Commit da7bbbd

Browse files
Merge pull request #186 from mjbommar/feature-server-run-checks
PR re: issue #113
2 parents 111ead6 + 0df71d1 commit da7bbbd

File tree

2 files changed

+58
-13
lines changed

2 files changed

+58
-13
lines changed

README.md

100644100755
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ These are read once, when tika/tika.py is initially loaded and used throughout a
3737
6. `TIKA_SERVER_CLASSPATH` - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
3838
7. `TIKA_LOG_PATH` - set to a directory with write permissions and the `tika.log` and `tika-server.log` files will be placed in this directory.
3939
8. `TIKA_PATH` - set to a directory with write permissions and the `tika_server.jar` file will be placed in this directory.
40+
9. `TIKA_JAVA` - set to the name of the Java runtime executable, e.g., `java` or `java9` (default: `java`).
41+
10. `TIKA_STARTUP_SLEEP` - number of seconds (`float`, default `5`) to wait after each unsuccessful startup check when the Tika server is launched at runtime.
42+
11. `TIKA_STARTUP_MAX_RETRY` - maximum number of startup checks (`int`, default `3`) to attempt when the Tika server is launched at runtime.
4043

4144
Testing it out
4245
==============

tika/tika.py

100644100755
Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ def make_content_disposition_header(fn):
168168
"org.apache.tika.language.translate.Lingo24Translator")
169169
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', False)
170170
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
171+
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
172+
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
173+
TikaJava = os.getenv("TIKA_JAVA", "java")
171174

172175
Verbose = 0
173176
EncodeUtf8 = 0
@@ -553,16 +556,19 @@ def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJ
553556
jarPath = os.path.join(TikaJarPath, 'tika-server.jar')
554557
if 'localhost' in serverEndpoint or '127.0.0.1' in serverEndpoint:
555558
alreadyRunning = checkPortIsOpen(serverHost, port)
556-
559+
557560
if not alreadyRunning:
558561
if not os.path.isfile(jarPath) and urlp.scheme != '':
559-
getRemoteJar(tikaServerJar, jarPath)
560-
562+
getRemoteJar(tikaServerJar, jarPath)
563+
561564
if not checkJarSig(tikaServerJar, jarPath):
562565
os.remove(jarPath)
563566
tikaServerJar = getRemoteJar(tikaServerJar, jarPath)
564-
565-
startServer(jarPath, serverHost, port, classpath)
567+
568+
status = startServer(jarPath, TikaJava, serverHost, port, classpath)
569+
if not status:
570+
log.error("Failed to receive startup confirmation from startServer.")
571+
raise RuntimeError("Unable to start Tika server.")
566572
return serverEndpoint
567573

568574
def checkJarSig(tikaServerJar, jarPath):
@@ -583,7 +589,7 @@ def checkJarSig(tikaServerJar, jarPath):
583589
return existingContents == m.hexdigest()
584590

585591

586-
def startServer(tikaServerJar, serverHost = ServerHost, port = Port, classpath=None):
592+
def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, port = Port, classpath=None):
587593
'''
588594
Starts Tika Server
589595
:param tikaServerJar: path to tika server jar
@@ -594,20 +600,56 @@ def startServer(tikaServerJar, serverHost = ServerHost, port = Port, classpath=N
594600
'''
595601
if classpath is None:
596602
classpath = TikaServerClasspath
597-
603+
598604
host = "localhost"
599605
if Windows:
600606
host = "0.0.0.0"
601-
607+
602608
if classpath:
603609
classpath += ":" + tikaServerJar
604610
else:
605611
classpath = tikaServerJar
606-
607-
cmd = 'java -cp %s org.apache.tika.server.TikaServerCli --port %i --host %s &' % (classpath, port, host)
608-
logFile = open(os.path.join(TikaServerLogFilePath, 'tika-server.log'), 'w')
609-
cmd = Popen(cmd , stdout= logFile, stderr = STDOUT, shell =True)
610-
time.sleep(5)
612+
613+
# setup command string
614+
cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %i --host %s &' \
615+
% (java_path, classpath, port, host)
616+
617+
# Check that we can write to log path
618+
try:
619+
tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
620+
logFile = open(tika_log_file_path, 'w')
621+
except PermissionError as e:
622+
log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
623+
return False
624+
625+
# Check that specified java binary is available on path
626+
try:
627+
_ = Popen(java_path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
628+
except FileNotFoundError as e:
629+
log.error("Unable to run java; is it installed?")
630+
return False
631+
632+
# Run java with jar args
633+
cmd = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
634+
635+
# Check logs and retry as configured
636+
try_count = 0
637+
is_started = False
638+
while try_count < TikaStartupMaxRetry:
639+
with open(tika_log_file_path, "r") as tika_log_file_tmp:
640+
# check for INFO string to confirm listening endpoint
641+
if "Started Apache Tika server at" in tika_log_file_tmp.read():
642+
is_started = True
643+
else:
644+
log.warning("Failed to see startup log message; retrying...")
645+
time.sleep(TikaStartupSleep)
646+
try_count += 1
647+
648+
if not is_started:
649+
log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
650+
return False
651+
else:
652+
return True
611653

612654
def toFilename(urlOrPath):
613655
value = re.sub('[^\w\s-]', '-', urlOrPath).strip().lower()

0 commit comments

Comments
 (0)