9494
9595"""
9696
97- import sys , os , getopt , time , codecs
97+ import sys , os , getopt , time , codecs , re
9898try :
9999 unicode_string = unicode
100100 binary_string = str
@@ -609,6 +609,11 @@ def startServer(tikaServerJar, serverHost = ServerHost, port = Port, classpath=N
609609 cmd = Popen (cmd , stdout = logFile , stderr = STDOUT , shell = True )
610610 time .sleep (5 )
611611
def toFilename(urlOrPath):
    '''
    Turn a URL or path string into a flat, lowercase, hyphen-separated name.

    Every character that is not a word character, whitespace, or a hyphen
    (for example ':', '/', '.', '?') is replaced with '-'.  The result is
    stripped of surrounding whitespace and lowercased, then each run of
    hyphens and/or whitespace is collapsed to a single '-', and leading and
    trailing hyphens are removed.

    :param urlOrPath: the URL or path string to convert
    :return: the converted string; empty if nothing but separators remain
    '''
    # Raw strings keep '\w' and '\s' as regex escapes; in a plain literal
    # they are invalid string escapes that newer Pythons warn about.
    value = re.sub(r'[^\w\s-]', '-', urlOrPath).strip().lower()
    return re.sub(r'[-\s]+', '-', value).strip('-')
615+
616+
612617def getRemoteFile (urlOrPath , destPath ):
613618 '''
614619 Fetches URL to local path or just returns absolute path.
@@ -622,7 +627,7 @@ def getRemoteFile(urlOrPath, destPath):
622627 elif urlp .scheme not in ('http' , 'https' ):
623628 return (urlOrPath , 'local' )
624629 else :
625- filename = urlOrPath . rsplit ( '/' , 1 )[ 1 ]
630+ filename = toFilename ( urlOrPath )
626631 destPath = destPath + '/' + filename
627632 log .info ('Retrieving %s to %s.' % (urlOrPath , destPath ))
628633 try :
0 commit comments