Skip to content

Commit 7c41923

Browse files
Merge pull request #165 from ekeydar/master
add toFilename to handle translation of url to filename
2 parents 5f107aa + bfd8606 commit 7c41923

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

tika/tika.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
9595
"""
9696

97-
import sys, os, getopt, time, codecs
97+
import sys, os, getopt, time, codecs, re
9898
try:
9999
unicode_string = unicode
100100
binary_string = str
@@ -609,6 +609,11 @@ def startServer(tikaServerJar, serverHost = ServerHost, port = Port, classpath=N
609609
cmd = Popen(cmd , stdout= logFile, stderr = STDOUT, shell =True)
610610
time.sleep(5)
611611

612+
def toFilename(urlOrPath):
    '''
    Translates a URL or path into a flat, lowercase, hyphen-separated name.

    Every character that is not a word character, whitespace or a hyphen is
    replaced by a hyphen. The result is stripped of surrounding whitespace
    and lowercased, then each run of hyphens and/or whitespace is collapsed
    into a single hyphen and leading/trailing hyphens are removed.
    :param urlOrPath: string holding the URL or path to translate
    :return: the translated name; an empty string when the input contains
             no word characters
    '''
    # Raw strings keep \w and \s as regex escapes instead of invalid string
    # escapes, which newer Python versions warn about.
    value = re.sub(r'[^\w\s-]', '-', urlOrPath).strip().lower()
    return re.sub(r'[-\s]+', '-', value).strip('-')
615+
616+
612617
def getRemoteFile(urlOrPath, destPath):
613618
'''
614619
Fetches URL to local path or just returns absolute path.
@@ -622,7 +627,7 @@ def getRemoteFile(urlOrPath, destPath):
622627
elif urlp.scheme not in ('http', 'https'):
623628
return (urlOrPath, 'local')
624629
else:
625-
filename = urlOrPath.rsplit('/',1)[1]
630+
filename = toFilename(urlOrPath)
626631
destPath = destPath + '/' +filename
627632
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
628633
try:

0 commit comments

Comments
 (0)