Skip to content

Commit ce4c3ca

Browse files
Merge pull request #444 from afuetterer/442-tests
fix: replace `urlretrieve` in `getRemoteFile`
2 parents 73231e3 + deec412 commit ce4c3ca

File tree

2 files changed

+61
-33
lines changed

2 files changed

+61
-33
lines changed

tika/tests/test_tika.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
17-
#
18-
# python -m unittest tests.tests
17+
1918
import os
2019
import unittest
2120

@@ -30,7 +29,7 @@ class CreateTest(unittest.TestCase):
3029
def test_remote_pdf(self):
3130
"""parse remote PDF"""
3231
self.assertTrue(tika.parser.from_file(
33-
'http://appsrv.achd.net/reports/rwservlet?food_rep_insp&P_ENCOUNTER=201504160015'))
32+
'https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf'))
3433

3534
def test_remote_html(self):
3635
"""parse remote HTML"""

tika/tika.py

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -105,17 +105,13 @@
105105
"""
106106

107107
import sys, os, getopt, time, codecs, re
108+
from pathlib import Path
108109
try:
109110
unicode_string = unicode
110111
binary_string = str
111112
except NameError:
112113
unicode_string = str
113114
binary_string = bytes
114-
115-
try:
116-
from urllib import urlretrieve
117-
except ImportError:
118-
from urllib.request import urlretrieve
119115
try:
120116
from urlparse import urlparse
121117
except ImportError:
@@ -757,6 +753,62 @@ def _is_file_object(f):
757753

758754
return isinstance(f, file_types)
759755

756+
757+
def _urlretrieve(
    url: str,
    filename: str,
    chunk_size: int = 8192,
    timeout: int = 30,
    verify_ssl: bool = True,
) -> str:
    """
    Download a file from a URL using requests with streaming support.

    Args:
        url: The URL to download from.
        filename: The local file path where the file will be saved. Missing
            parent directories are created.
        chunk_size: Size of chunks to download at a time in bytes (default: 8192).
        timeout: Request timeout in seconds (default: 30).
        verify_ssl: Whether to verify SSL certificates (default: True).

    Returns:
        The ``filename`` path where the file was saved.

    Raises:
        RuntimeError: If the HTTP request fails; the underlying
            ``requests.RequestException`` is attached as ``__cause__``.
        OSError: If the destination directory or file cannot be written.
    """
    headers = {"user-agent": "tika-python"}

    # Ensure the directory exists
    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    # Only remove the destination if this call actually opened it for writing,
    # so a failed request never deletes a file that was already in place.
    wrote_partial = False

    try:
        # With stream=True the connection stays checked out until the body is
        # consumed or the response is closed; the context manager guarantees
        # the release on every exit path.
        with requests.get(
            url,
            headers=headers,
            stream=True,
            timeout=timeout,
            verify=verify_ssl,
        ) as response:
            response.raise_for_status()

            with open(filename, "wb") as f:
                wrote_partial = True
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)

    except requests.RequestException as e:
        # NOTE: this clause must precede ``except OSError`` because
        # requests.RequestException is itself an OSError subclass.
        if wrote_partial and os.path.exists(filename):
            os.remove(filename)
        raise RuntimeError(f"Failed to download {url}: {e}") from e
    except OSError:
        # Local write failure (disk full, permissions, ...): do not leave a
        # truncated file behind, then let the original error propagate.
        if wrote_partial and os.path.exists(filename):
            os.remove(filename)
        raise

    return filename
810+
811+
760812
def getRemoteFile(urlOrPath, destPath):
761813
'''
762814
Fetches URL to local path or just returns absolute path.
@@ -777,18 +829,7 @@ def getRemoteFile(urlOrPath, destPath):
777829
filename = toFilename(urlOrPath)
778830
destPath = destPath + '/' + filename
779831
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
780-
try:
781-
urlretrieve(urlOrPath, destPath)
782-
except IOError:
783-
# monkey patch fix for SSL/Windows per Tika-Python #54
784-
# https://github.com/chrismattmann/tika-python/issues/54
785-
import ssl
786-
if hasattr(ssl, '_create_unverified_context'):
787-
ssl._create_default_https_context = ssl._create_unverified_context
788-
# delete whatever we had there
789-
if os.path.exists(destPath) and os.path.isfile(destPath):
790-
os.remove(destPath)
791-
urlretrieve(urlOrPath, destPath)
832+
_urlretrieve(urlOrPath, destPath)
792833
return (destPath, 'remote')
793834

794835
def getRemoteJar(urlOrPath, destPath):
@@ -803,19 +844,7 @@ def getRemoteJar(urlOrPath, destPath):
803844
return (os.path.abspath(urlOrPath), 'local')
804845
else:
805846
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
806-
try:
807-
urlretrieve(urlOrPath, destPath)
808-
except IOError:
809-
# monkey patch fix for SSL/Windows per Tika-Python #54
810-
# https://github.com/chrismattmann/tika-python/issues/54
811-
import ssl
812-
if hasattr(ssl, '_create_unverified_context'):
813-
ssl._create_default_https_context = ssl._create_unverified_context
814-
# delete whatever we had there
815-
if os.path.exists(destPath) and os.path.isfile(destPath):
816-
os.remove(destPath)
817-
urlretrieve(urlOrPath, destPath)
818-
847+
_urlretrieve(urlOrPath, destPath)
819848
return (destPath, 'remote')
820849

821850
def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):

0 commit comments

Comments
 (0)