Skip to content

Commit 3e0b2b5

Browse files
committed
io.UrlReader: support Dropbox URLs
1 parent 8ae84ef commit 3e0b2b5

File tree

2 files changed

+35
-16
lines changed

2 files changed

+35
-16
lines changed

Orange/data/io.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
from itertools import chain, repeat
1717
from functools import lru_cache
1818
from collections import OrderedDict
19-
from urllib.parse import urlparse, unquote as urlunquote
20-
from urllib.request import urlopen
19+
from urllib.parse import urlparse, urlsplit, urlunsplit, unquote as urlunquote
20+
from urllib.request import urlopen, Request
2121

2222
import bottleneck as bn
2323
import numpy as np
@@ -809,10 +809,17 @@ def write(cls, filename, tree):
809809

810810

811811
class UrlReader(FileFormat):
812+
@staticmethod
def urlopen(url):
    """Open `url` with a browser-like User-Agent header.

    The request is issued through the module-level ``urlopen`` with a
    10-second timeout, and whatever that call returns is handed back.
    """
    # Avoid 403 error with servers that dislike scrapers
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux) Gecko/20100101 Firefox/',
    }
    request = Request(url, headers=browser_headers)
    return urlopen(request, timeout=10)
819+
812820
def read(self):
813821
self.filename = self._trim(self._resolve_redirects(self.filename))
814-
815-
with contextlib.closing(urlopen(self.filename, timeout=10)) as response:
822+
with contextlib.closing(self.urlopen(self.filename)) as response:
816823
name = self._suggest_filename(response.headers['content-disposition'])
817824
with NamedTemporaryFile(suffix=name, delete=False) as f:
818825
f.write(response.read())
@@ -828,12 +835,14 @@ def read(self):
828835

829836
def _resolve_redirects(self, url):
830837
# Resolve (potential) redirects to a final URL
831-
with contextlib.closing(urlopen(url, timeout=10)) as response:
838+
with contextlib.closing(self.urlopen(url)) as response:
832839
return response.url
833840

834-
def _trim(self, url):
841+
@classmethod
842+
def _trim(cls, url):
835843
URL_TRIMMERS = (
836-
self._trim_googlesheet_url,
844+
cls._trim_googlesheet,
845+
cls._trim_dropbox,
837846
)
838847
for trim in URL_TRIMMERS:
839848
try:
@@ -844,7 +853,8 @@ def _trim(self, url):
844853
break
845854
return url
846855

847-
def _trim_googlesheet_url(self, url):
856+
@staticmethod
857+
def _trim_googlesheet(url):
848858
match = re.match(r'(?:https?://)?(?:www\.)?'
849859
'docs\.google\.com/spreadsheets/d/'
850860
'(?P<workbook_id>[-\w_]+)'
@@ -861,6 +871,13 @@ def _trim_googlesheet_url(self, url):
861871
url += '&gid=' + sheet
862872
return url
863873

874+
@staticmethod
875+
def _trim_dropbox(url):
876+
parts = urlsplit(url)
877+
if not parts.netloc.endswith('dropbox.com'):
878+
raise ValueError
879+
return urlunsplit(parts._replace(query='dl=1'))
880+
864881
def _suggest_filename(self, content_disposition):
865882
default_name = re.sub(r'[\\:/]', '_', urlparse(self.filename).path)
866883

Orange/tests/test_table.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,8 +1313,6 @@ def test_load_from_url(self):
13131313
class _MockUrlOpen(MagicMock):
13141314
headers = {'content-disposition': 'attachment; filename="Something-FormResponses.tsv"; '
13151315
'filename*=UTF-8''Something%20%28Responses%29.tsv'}
1316-
url = 'https://docs.google.com/spreadsheets/d/ABCD/edit'
1317-
13181316
def __enter__(self): return self
13191317

13201318
def __exit__(self, *args, **kwargs): pass
@@ -1327,12 +1325,16 @@ def read(self): return b'''\
13271325
urlopen = _MockUrlOpen()
13281326

13291327
@patch('Orange.data.io.urlopen', urlopen)
def test_trimmed_urls(self):
    """Share links are rewritten before fetching and sent with a browser UA."""
    share_links = (
        'https://docs.google.com/spreadsheets/d/ABCD/edit',
        'https://www.dropbox.com/s/ABCD/filename.csv',
    )
    for url in share_links:
        # The mocked response reports this URL as the redirect target.
        self._MockUrlOpen.url = url
        table = data.Table(url)
        # First positional argument of the most recent mocked urlopen call.
        request = self.urlopen.call_args[0][0]
        self.assertNotEqual(url, request.full_url)
        self.assertIn('Mozilla/5.0', request.headers.get('User-agent', ''))
        self.assertEqual(len(table), 2)
        self.assertEqual(table.name, 'Something-FormResponses')
13361338

13371339

13381340
class CreateTableWithDomain(TableTests):

0 commit comments

Comments
 (0)