7878 from urllib .parse import urlparse as urlparse
7979
8080if sys .version_info [0 ] < 3 :
81- import codecs
8281 open = codecs .open
8382
8483import requests
9493TikaVersion = os .getenv ('TIKA_VERSION' , '1.12' )
9594TikaJarPath = tempfile .gettempdir ()
9695TikaFilesPath = tempfile .gettempdir ()
96+ TikaServerLogFilePath = tempfile .gettempdir ()
9797TikaServerJar = os .getenv (
9898 'TIKA_SERVER_JAR' ,
9999 "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/" + TikaVersion + "/tika-server-" + TikaVersion + ".jar" )
@@ -116,7 +116,7 @@ def die(*s): warn('Error:', *s); echo2(USAGE); sys.exit()
116116
117117def runCommand (cmd , option , urlOrPaths , port , outDir = None , serverHost = ServerHost , tikaServerJar = TikaServerJar , verbose = Verbose , encode = EncodeUtf8 ):
118118 """Run the Tika command by calling the Tika server and return results in JSON format (or plain text)."""
119- # import pdb; pdb.set_trace()
119+ # import pdb; pdb.set_trace()
120120 if (cmd in 'parse' or cmd in 'detect' ) and (urlOrPaths == [] or urlOrPaths == None ):
121121 die ('No URLs/paths specified.' )
122122 serverEndpoint = 'http://' + serverHost + ':' + port
@@ -142,12 +142,12 @@ def getPaths(urlOrPaths):
142142 """
143143 paths = []
144144 for eachUrlOrPaths in urlOrPaths :
145- if os .path .isdir (eachUrlOrPaths ):
146- for root , directories , filenames in walk (eachUrlOrPaths ):
147- for filename in filenames :
148- paths .append (os .path .join (root ,filename ))
149- else :
150- paths .append (eachUrlOrPaths )
145+ if os .path .isdir (eachUrlOrPaths ):
146+ for root , directories , filenames in walk (eachUrlOrPaths ):
147+ for filename in filenames :
148+ paths .append (os .path .join (root ,filename ))
149+ else :
150+ paths .append (eachUrlOrPaths )
151151 return paths
152152
153153def parseAndSave (option , urlOrPaths , outDir = None , serverEndpoint = ServerEndpoint , verbose = Verbose , tikaServerJar = TikaServerJar ,
@@ -158,15 +158,15 @@ def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint,
158158 metaPaths = []
159159 paths = getPaths (urlOrPaths )
160160 for path in paths :
161- if outDir is None :
162- metaPath = path + metaExtension
163- else :
164- metaPath = os .path .join (outDir , os .path .split (path )[1 ] + metaExtension )
165- echo2 ('Writing %s' % metaPath )
166- with open (metaPath , 'w' , 'utf-8' ) as f :
167- f .write (parse1 (option , path , serverEndpoint , verbose , tikaServerJar , \
161+ if outDir is None :
162+ metaPath = path + metaExtension
163+ else :
164+ metaPath = os .path .join (outDir , os .path .split (path )[1 ] + metaExtension )
165+ echo2 ('Writing %s' % metaPath )
166+ with open (metaPath , 'w' , 'utf-8' ) as f :
167+ f .write (parse1 (option , path , serverEndpoint , verbose , tikaServerJar , \
168168 responseMimeType , services )[1 ] + u"\n " )
169- metaPaths .append (metaPath )
169+ metaPaths .append (metaPath )
170170 return metaPaths
171171
172172
@@ -181,7 +181,7 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
181181 responseMimeType = 'application/json' ,
182182 services = {'meta' : '/meta' , 'text' : '/tika' , 'all' : '/rmeta/text' }):
183183 """Parse the object and return extracted metadata and/or text in JSON format."""
184- path , type = getRemoteFile (urlOrPath , TikaFilesPath )
184+ path , file_type = getRemoteFile (urlOrPath , TikaFilesPath )
185185 if option not in services :
186186 warn ('config option must be one of meta, text, or all; using all.' )
187187 service = services .get (option , services ['all' ])
@@ -190,7 +190,7 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti
190190 {'Accept' : responseMimeType , 'Content-Disposition' : 'attachment; filename=%s' % os .path .basename (path )},
191191 verbose , tikaServerJar )
192192
193- if type == 'remote' : os .unlink (path )
193+ if file_type == 'remote' : os .unlink (path )
194194 return (status , response )
195195
196196def detectLang (option , urlOrPaths , serverEndpoint = ServerEndpoint , verbose = Verbose , tikaServerJar = TikaServerJar ,
@@ -349,7 +349,7 @@ def startServer(tikaServerJar, serverHost = ServerHost, port = Port):
349349 host = "0.0.0.0"
350350
351351 cmd = 'java -jar %s --port %i --host %s &' % (tikaServerJar , port , host )
352- logFile = open (os .path .join (TikaJarPath , 'tika-server.log' ), 'w' )
352+ logFile = open (os .path .join (TikaServerLogFilePath , 'tika-server.log' ), 'w' )
353353 cmd = Popen (cmd , stdout = logFile , stderr = STDOUT , shell = True )
354354 time .sleep (5 )
355355
0 commit comments