1+ import json
12from datetime import datetime , timedelta
23from io import BytesIO
4+ from itertools import islice
5+
36import requests
47import os
58import platform
69import zipfile
10+ from warnings import warn
11+ from enum import IntEnum
712
# Package version; interpolated into the User-Agent header sent with every request.
VERSION = '0.8.0'
914
1015class TrancoList ():
1116 def __init__ (self , date , list_id , lst ):
@@ -20,91 +25,156 @@ def top(self, num=1000000):
def rank(self, domain):
    """
    Look up a domain in this list.
    :param domain: key to look up in ``self.list``.
    :return: the value stored for ``domain``, or -1 when the domain is not present.
    """
    not_ranked = -1  # sentinel for domains that are absent from the list
    return self.list.get(domain, not_ranked)
2227
class TrancoCacheType(IntEnum):
    """
    Cache state recorded per list ID in the cache metadata.

    Members are integers ordered from "nothing cached" to "full list cached",
    so two states can be compared (or combined with ``max``) directly.
    """

    NOT_CACHED = 0       # no file stored for this list
    CACHED_NOT_FULL = 1  # a file is stored, but it is not the full list
    CACHED_FULL = 2      # the full list is stored
32+
2333class Tranco ():
def __init__(self, **kwargs):
    """
    Set up the cache directory, cache metadata and HTTP session.

    :param kwargs:
        cache_dir: <str> directory used to cache Tranco top lists, default: cwd + .tranco/
        account_email: <str> Account email address: retrieve from https://tranco-list.eu/account
        api_key: <str> API key: retrieve from https://tranco-list.eu/account
    """

    # Caching is required.
    self.cache_dir = kwargs.get('cache_dir', None)
    if self.cache_dir is None:
        cwd = os.getcwd()
        self.cache_dir = os.path.join(cwd, '.tranco')
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory appears between a check and the creation.
    os.makedirs(self.cache_dir, exist_ok=True)
    self.cache_metadata = {}
    self._load_cache_metadata()

    self.account_email = kwargs.get('account_email')
    self.api_key = kwargs.get('api_key')

    # One shared session so every request carries the identifying User-Agent.
    self.session = requests.Session()
    self.session.headers.update({'User-Agent': 'Python/{} python-requests/{} tranco-python/{}'.format(platform.python_version(), requests.__version__, VERSION)})
4757
48- def _cache_path (self , date ):
49- return os .path .join (self .cache_dir , date + '-DEFAULT.csv' )
58+ def _cache_metadata_path (self ):
59+ return os .path .join (self .cache_dir , 'metadata.json' )
60+
61+ def _cache_path (self , list_id ):
62+ return os .path .join (self .cache_dir , '{}.csv' .format (list_id ))
63+
64+ def _load_cache_metadata (self ):
65+ if not os .path .exists (self ._cache_metadata_path ()):
66+ self ._write_cache_metadata ()
67+ with open (self ._cache_metadata_path (), "rt" ) as f :
68+ self .cache_metadata = json .load (f )
69+
70+ def _write_cache_metadata (self ):
71+ with open (self ._cache_metadata_path (), 'wt' ) as f :
72+ json .dump (self .cache_metadata , f )
73+
def _get_list_cache(self, list_id):
    """
    :param list_id: ID of the list to look up in the cache metadata.
    :return: the cache state recorded for ``list_id``, or
             ``TrancoCacheType.NOT_CACHED`` when there is no entry.
    """
    fallback = TrancoCacheType.NOT_CACHED
    return self.cache_metadata.get(list_id, fallback)
76+
def _is_cached(self, list_id, full=False):
    """
    Tell whether a usable copy of a list is recorded in the cache metadata.

    :param list_id: ID of the list to check; must be truthy.
    :param full: when true, only a full copy counts as cached.
    :return: True when the recorded cache state satisfies the request, else False.
    :raises ValueError: when no list ID is given.
    """
    if not list_id:
        raise ValueError("You must pass a list ID to cache a list.")

    state = self._get_list_cache(list_id)
    if state == TrancoCacheType.NOT_CACHED:
        return False

    # A partial copy is not enough when the caller asked for the full list.
    full_copy_missing = full and (state == TrancoCacheType.CACHED_NOT_FULL)
    return not full_copy_missing
87+
def _add_to_cache(self, list_id=None, full=False):
    """
    Record a list as cached and persist the cache metadata.

    The stored state never goes down: the larger of the requested state and
    the state already recorded is kept.

    :param list_id: ID of the list that was cached; must be truthy.
    :param full: whether the full list was cached.
    :raises ValueError: when no list ID is given.
    """
    if not list_id:
        raise ValueError("You must pass a list ID to cache a list.")

    if full:
        requested = TrancoCacheType.CACHED_FULL
    else:
        requested = TrancoCacheType.CACHED_NOT_FULL
    recorded = self._get_list_cache(list_id)

    self.cache_metadata[list_id] = max(requested, recorded)
    self._write_cache_metadata()
93+
def clear_cache(self):
    """
    Delete every file in the cache directory and reset the cache metadata.

    The in-memory metadata is emptied before it is written back. Otherwise
    the stale entries would be persisted again and deleted lists would still
    be reported as cached.
    """
    for entry in os.listdir(self.cache_dir):
        os.remove(os.path.join(self.cache_dir, entry))
    # The metadata file was removed above; start again from an empty index.
    self.cache_metadata = {}
    self._write_cache_metadata()
5098
def list(self, date=None, list_id=None, subdomains=False, full=False):
    """
    Retrieve a Tranco top list.
    :param date: Get the daily list for this date. If not given, the latest list is returned.
                 Combine with `subdomains` to select whether subdomains are included.
    :param list_id: Get the list with this ID. If neither the list ID nor date are given, the latest list is returned.
    :param subdomains: Include subdomains in the list. Only relevant when requesting a daily list. Default: False.
    :param full: Retrieve the full list (else only the top million). Default: False.
    :return: TrancoList object for the requested list.
    :raises ValueError: when both a date and a list ID are passed.
    """
    if date and list_id:
        raise ValueError("You can't pass a date as well as a list ID.")
    if list_id and subdomains:
        warn("Subdomains parameter is ignored when passing a list ID.")

    if not list_id:
        if (not date) or (date == 'latest'):  # no arguments given: default to latest list
            yesterday = (datetime.utcnow() - timedelta(days=1))
            date = yesterday.strftime('%Y-%m-%d')
        list_id = self._get_list_id_for_date(date, subdomains=subdomains)

    if not self._is_cached(list_id, full):
        self._download_file(list_id, full)  # download list and load into cache

    with open(self._cache_path(list_id)) as f:  # read list from cache
        # The cached file may hold more than the top million, so cap the
        # number of lines read unless the full list was requested.
        lines = f if full else islice(f, 1000000)
        # Lines read from a file keep their line ending; strip it so the
        # domain names do not end in a newline. Each line is "<rank>,<domain>".
        domains = [line.rstrip('\r\n') for line in lines]

    return TrancoList(date, list_id, [entry[entry.index(',') + 1:] for entry in domains])
68129
69- def _get_list_id_for_date (self , date ):
70- r1 = self .session .get ('https://tranco-list.eu/daily_list_id?date={}' .format (date ))
130+ def _get_list_id_for_date (self , date , subdomains = False ):
131+ r1 = self .session .get ('https://tranco-list.eu/daily_list_id?date={}&subdomains={} ' .format (date , str ( subdomains ). lower () ))
71132 if r1 .status_code == 200 :
72133 return r1 .text
73134 else :
74135 raise AttributeError ("The daily list for this date is currently unavailable." )
75136
137+ def _download_file (self , list_id , full = False ):
138+ if full :
139+ self ._download_full_file (list_id )
140+ else :
141+ self ._download_zip_file (list_id )
142+ self ._add_to_cache (list_id , full )
143+
def _download_zip_file(self, list_id):
    """
    Download the top-million version of a list and store it in the cache file.

    The daily ZIP download is tried first. On HTTP 403 the plain download of
    the first 1000000 entries is used instead.

    :param list_id: ID of the list to download.
    :raises AttributeError: when the list cannot be downloaded.
    """
    zip_url = 'https://tranco-list.eu/download_daily/{}'.format(list_id)
    response = self.session.get(zip_url, stream=True)
    status = response.status_code

    if status == 200:
        # The archive holds a single CSV member named top-1m.csv.
        with zipfile.ZipFile(BytesIO(response.content)) as archive:
            with archive.open('top-1m.csv') as member:
                payload = member.read()
    elif status == 403:
        # List not available as ZIP file
        csv_url = 'https://tranco-list.eu/download/{}/1000000'.format(list_id)
        fallback = self.session.get(csv_url)
        if fallback.status_code != 200:
            raise AttributeError("The daily list for this date is currently unavailable.")
        payload = fallback.content
    elif status == 502:
        # List unavailable (bad gateway)
        raise AttributeError("This list is currently unavailable.")
    else:
        # List unavailable (non-success status code)
        raise AttributeError("The daily list for this date is currently unavailable.")

    # Both successful paths end with the raw CSV bytes written to the cache file.
    with open(self._cache_path(list_id), 'wb') as out:
        out.write(payload)
107169
170+ def _download_full_file (self , list_id ):
171+ download_url = 'https://tranco-list.eu/download/{}/full' .format (list_id )
172+ r = self .session .get (download_url )
173+ if r .status_code == 200 :
174+ file_bytes = r .content
175+ with open (self ._cache_path (list_id ), 'wb' ) as f :
176+ f .write (file_bytes )
177+
108178 def configure (self , configuration ):
109179 """
110180 Configure a custom list (https://tranco-list.eu/configure).
0 commit comments