105105"""
106106
107107import sys , os , getopt , time , codecs , re
108+ from pathlib import Path
108109try :
109110 unicode_string = unicode
110111 binary_string = str
111112except NameError :
112113 unicode_string = str
113114 binary_string = bytes
114-
115- try :
116- from urllib import urlretrieve
117- except ImportError :
118- from urllib .request import urlretrieve
119115try :
120116 from urlparse import urlparse
121117except ImportError :
@@ -757,6 +753,62 @@ def _is_file_object(f):
757753
758754 return isinstance (f , file_types )
759755
756+
def _urlretrieve(
    url: str,
    filename: str,
    chunk_size: int = 8192,
    timeout: int = 30,
    verify_ssl: bool = True,
) -> str:
    """
    Download a file from a URL using requests with streaming support.

    Args:
        url: The URL to download from.
        filename: The local file path where the file will be saved. Missing
            parent directories are created.
        chunk_size: Size of chunks to download at a time in bytes (default: 8192).
        timeout: Request timeout in seconds (default: 30).
        verify_ssl: Whether to verify SSL certificates (default: True).

    Returns:
        The ``filename`` path where the file was saved.

    Raises:
        RuntimeError: If the HTTP request or the streamed transfer fails
            (chained from the underlying ``requests.RequestException``).
        OSError: If the target directory cannot be created or the file
            cannot be written.
    """
    headers = {"user-agent": "tika-python"}

    # Ensure the directory exists
    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    def _discard_partial():
        # Best-effort removal of an incomplete download; a failure here must
        # not mask the exception that triggered the cleanup.
        try:
            if os.path.isfile(filename):
                os.remove(filename)
        except OSError:
            pass

    try:
        # Using the response as a context manager guarantees the streamed
        # connection is released even if the transfer is interrupted.
        with requests.get(
            url,
            headers=headers,
            stream=True,
            timeout=timeout,
            verify=verify_ssl,
        ) as response:
            response.raise_for_status()

            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)
    except requests.RequestException as e:
        # Must be handled before OSError: RequestException derives from IOError.
        _discard_partial()
        raise RuntimeError(f"Failed to download {url}: {e}") from e
    except OSError:
        # Write failure (e.g. disk full): do not leave a truncated file behind.
        _discard_partial()
        raise

    return filename
810+
811+
760812def getRemoteFile (urlOrPath , destPath ):
761813 '''
762814 Fetches URL to local path or just returns absolute path.
@@ -777,18 +829,7 @@ def getRemoteFile(urlOrPath, destPath):
777829 filename = toFilename (urlOrPath )
778830 destPath = destPath + '/' + filename
779831 log .info ('Retrieving %s to %s.' % (urlOrPath , destPath ))
780- try :
781- urlretrieve (urlOrPath , destPath )
782- except IOError :
783- # monkey patch fix for SSL/Windows per Tika-Python #54
784- # https://github.com/chrismattmann/tika-python/issues/54
785- import ssl
786- if hasattr (ssl , '_create_unverified_context' ):
787- ssl ._create_default_https_context = ssl ._create_unverified_context
788- # delete whatever we had there
789- if os .path .exists (destPath ) and os .path .isfile (destPath ):
790- os .remove (destPath )
791- urlretrieve (urlOrPath , destPath )
832+ _urlretrieve (urlOrPath , destPath )
792833 return (destPath , 'remote' )
793834
794835def getRemoteJar (urlOrPath , destPath ):
@@ -803,19 +844,7 @@ def getRemoteJar(urlOrPath, destPath):
803844 return (os .path .abspath (urlOrPath ), 'local' )
804845 else :
805846 log .info ('Retrieving %s to %s.' % (urlOrPath , destPath ))
806- try :
807- urlretrieve (urlOrPath , destPath )
808- except IOError :
809- # monkey patch fix for SSL/Windows per Tika-Python #54
810- # https://github.com/chrismattmann/tika-python/issues/54
811- import ssl
812- if hasattr (ssl , '_create_unverified_context' ):
813- ssl ._create_default_https_context = ssl ._create_unverified_context
814- # delete whatever we had there
815- if os .path .exists (destPath ) and os .path .isfile (destPath ):
816- os .remove (destPath )
817- urlretrieve (urlOrPath , destPath )
818-
847+ _urlretrieve (urlOrPath , destPath )
819848 return (destPath , 'remote' )
820849
821850def checkPortIsOpen (remoteServerHost = ServerHost , port = Port ):
0 commit comments