1616from itertools import chain , repeat
1717from functools import lru_cache
1818from collections import OrderedDict
19- from urllib .parse import urlparse , unquote as urlunquote
20- from urllib .request import urlopen
19+ from urllib .parse import urlparse , urlsplit , urlunsplit , unquote as urlunquote
20+ from urllib .request import urlopen , Request
2121
2222import bottleneck as bn
2323import numpy as np
@@ -809,10 +809,17 @@ def write(cls, filename, tree):
809809
810810
811811class UrlReader (FileFormat ):
812+ @staticmethod
813+ def urlopen (url ):
814+ req = Request (
815+ url ,
816+ # Avoid 403 error with servers that dislike scrapers
817+ headers = {'User-Agent' : 'Mozilla/5.0 (X11; Linux) Gecko/20100101 Firefox/' })
818+ return urlopen (req , timeout = 10 )
819+
812820 def read (self ):
813821 self .filename = self ._trim (self ._resolve_redirects (self .filename ))
814-
815- with contextlib .closing (urlopen (self .filename , timeout = 10 )) as response :
822+ with contextlib .closing (self .urlopen (self .filename )) as response :
816823 name = self ._suggest_filename (response .headers ['content-disposition' ])
817824 with NamedTemporaryFile (suffix = name , delete = False ) as f :
818825 f .write (response .read ())
@@ -828,12 +835,14 @@ def read(self):
828835
829836 def _resolve_redirects (self , url ):
830837 # Resolve (potential) redirects to a final URL
831- with contextlib .closing (urlopen (url , timeout = 10 )) as response :
838+ with contextlib .closing (self . urlopen (url )) as response :
832839 return response .url
833840
834- def _trim (self , url ):
841+ @classmethod
842+ def _trim (cls , url ):
835843 URL_TRIMMERS = (
836- self ._trim_googlesheet_url ,
844+ cls ._trim_googlesheet ,
845+ cls ._trim_dropbox ,
837846 )
838847 for trim in URL_TRIMMERS :
839848 try :
@@ -844,7 +853,8 @@ def _trim(self, url):
844853 break
845854 return url
846855
847- def _trim_googlesheet_url (self , url ):
856+ @staticmethod
857+ def _trim_googlesheet (url ):
848858 match = re .match (r'(?:https?://)?(?:www\.)?'
849859 'docs\.google\.com/spreadsheets/d/'
850860 '(?P<workbook_id>[-\w_]+)'
@@ -861,6 +871,13 @@ def _trim_googlesheet_url(self, url):
861871 url += '&gid=' + sheet
862872 return url
863873
874+ @staticmethod
875+ def _trim_dropbox (url ):
876+ parts = urlsplit (url )
877+ if not parts .netloc .endswith ('dropbox.com' ):
878+ raise ValueError
879+ return urlunsplit (parts ._replace (query = 'dl=1' ))
880+
864881 def _suggest_filename (self , content_disposition ):
865882 default_name = re .sub (r'[\\:/]' , '_' , urlparse (self .filename ).path )
866883
0 commit comments