Skip to content

Commit 3d791d3

Browse files
committed
Add parameters to fetch subdomains, full list
Refactor caching system; require caching; version bump
1 parent 566dde3 commit 3d791d3

File tree

7 files changed

+125
-39
lines changed

7 files changed

+125
-39
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="tranco",
8-
version="0.7.1",
8+
version="0.8.0",
99
author="Victor Le Pochat",
1010
author_email="[email protected]",
1111
description="Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation",

tests/conftest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
@pytest.fixture(scope="session")
66
def tranco():
77
t = Tranco(cache=True, cache_dir='.tranco')
8-
t.list() # prefetch list
98
return t
109

1110

tests/test_download.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
def test_daily(tranco):
2+
l = tranco.list(date="2024-01-01")
3+
assert l.list_id == "V929N"

tests/test_parameters.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
def test_full(tranco):
2+
l = tranco.list(list_id="93N82", full=True)
3+
assert len(l.list) == 8469
4+
5+
6+
def test_subdomains(tranco):
7+
l = tranco.list(date="2024-01-01", subdomains=True)
8+
assert l.list_id == "G6Y6K"
9+
10+
11+
def test_top_1m_after_full(tranco):
12+
lf = tranco.list(list_id="G6Y6K", full=True)
13+
assert len(lf.list) > 1000000
14+
ln = tranco.list(list_id="G6Y6K", full=False)
15+
assert len(ln.list) == 1000000
Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,5 @@
11
import uuid
22

3-
4-
def test_top_10(tranco_list):
5-
assert len(tranco_list.top(10)) == 10
6-
7-
8-
def test_top_1000000(tranco_list):
9-
assert len(tranco_list.top(1000000)) == 1000000
10-
11-
123
def test_domain_rank(tranco_list):
134
top_1 = tranco_list.top(1)[0]
145
assert tranco_list.rank(top_1) == 1

tests/test_top.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
def test_top_10(tranco_list):
2+
assert len(tranco_list.top(10)) == 10
3+
4+
5+
def test_top_1000000(tranco_list):
6+
assert len(tranco_list.top(1000000)) == 1000000
7+
8+

tranco/tranco.py

Lines changed: 98 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1+
import json
12
from datetime import datetime, timedelta
23
from io import BytesIO
4+
from itertools import islice
5+
36
import requests
47
import os
58
import platform
69
import zipfile
10+
from warnings import warn
11+
from enum import IntEnum
712

8-
VERSION = '0.7.1'
13+
VERSION = '0.8.0'
914

1015
class TrancoList():
1116
def __init__(self, date, list_id, lst):
@@ -20,91 +25,156 @@ def top(self, num=1000000):
2025
def rank(self, domain):
2126
return self.list.get(domain, -1)
2227

28+
class TrancoCacheType(IntEnum):
29+
NOT_CACHED = 0
30+
CACHED_NOT_FULL = 1
31+
CACHED_FULL = 2
32+
2333
class Tranco():
2434
def __init__(self, **kwargs):
2535
"""
2636
:param kwargs:
27-
cache: <bool> enables/disables caching, default: True
2837
cache_dir: <str> directory used to cache Tranco top lists, default: cwd + .tranco/
2938
account_email: <str> Account email address: retrieve from https://tranco-list.eu/account
3039
api_key: <str> API key: retrieve from https://tranco-list.eu/account
3140
"""
3241

33-
self.should_cache = kwargs.get('cache', True)
42+
# Caching is required.
3443
self.cache_dir = kwargs.get('cache_dir', None)
3544
if self.cache_dir is None:
3645
cwd = os.getcwd()
3746
self.cache_dir = os.path.join(cwd, '.tranco')
38-
39-
if self.should_cache and not os.path.exists(self.cache_dir):
47+
if not os.path.exists(self.cache_dir):
4048
os.mkdir(self.cache_dir)
49+
self.cache_metadata = {}
50+
self._load_cache_metadata()
4151

4252
self.account_email = kwargs.get('account_email')
4353
self.api_key = kwargs.get('api_key')
4454

4555
self.session = requests.Session()
4656
self.session.headers.update({'User-Agent': 'Python/{} python-requests/{} tranco-python/{}'.format(platform.python_version(), requests.__version__, VERSION)})
4757

48-
def _cache_path(self, date):
49-
return os.path.join(self.cache_dir, date + '-DEFAULT.csv')
58+
def _cache_metadata_path(self):
59+
return os.path.join(self.cache_dir, 'metadata.json')
60+
61+
def _cache_path(self, list_id):
62+
return os.path.join(self.cache_dir, '{}.csv'.format(list_id))
63+
64+
def _load_cache_metadata(self):
65+
if not os.path.exists(self._cache_metadata_path()):
66+
self._write_cache_metadata()
67+
with open(self._cache_metadata_path(), "rt") as f:
68+
self.cache_metadata = json.load(f)
69+
70+
def _write_cache_metadata(self):
71+
with open(self._cache_metadata_path(), 'wt') as f:
72+
json.dump(self.cache_metadata, f)
73+
74+
def _get_list_cache(self, list_id):
75+
return self.cache_metadata.get(list_id, TrancoCacheType.NOT_CACHED)
76+
77+
def _is_cached(self, list_id, full=False):
78+
if not list_id:
79+
raise ValueError("You must pass a list ID to cache a list.")
80+
list_cache = self._get_list_cache(list_id)
81+
if list_cache == TrancoCacheType.NOT_CACHED:
82+
return False
83+
84+
if full and (list_cache == TrancoCacheType.CACHED_NOT_FULL): # need full, but full not present
85+
return False
86+
return True
87+
88+
def _add_to_cache(self, list_id=None, full=False):
89+
if not list_id:
90+
raise ValueError("You must pass a list ID to cache a list.")
91+
self.cache_metadata[list_id] = max(TrancoCacheType.CACHED_FULL if full else TrancoCacheType.CACHED_NOT_FULL, self._get_list_cache(list_id))
92+
self._write_cache_metadata()
93+
94+
def clear_cache(self):
    """
    Remove all cached lists and reset the cache metadata.

    Every file in `self.cache_dir` is deleted (including the metadata file),
    the in-memory metadata is emptied, and `_load_cache_metadata` is called
    to recreate the metadata file. Subdirectories are left untouched.
    """
    for name in os.listdir(self.cache_dir):
        path = os.path.join(self.cache_dir, name)
        # os.remove() cannot delete directories; skip them instead of crashing.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
    # Forget the in-memory entries before reloading: when the metadata file is
    # missing, `_load_cache_metadata` first writes the in-memory dict back to
    # disk, so stale entries would keep marking deleted lists as cached.
    self.cache_metadata = {}
    self._load_cache_metadata()
5098

51-
def list(self, date=None, list_id=None):
99+
def list(self, date=None, list_id=None, subdomains=False, full=False):
    """
    Retrieve a Tranco top list.
    :param date: Get the daily list for this date. If not given, the latest list is returned.
                 Combine with `subdomains` to select whether subdomains are included.
    :param list_id: Get the list with this ID. If neither the list ID nor date are given, the latest list is returned.
    :param subdomains: Include subdomains in the list. Only relevant when requesting a daily list. Default: False.
    :param full: Retrieve the full list (else only the top million). Default: False.
    :return: TrancoList object for the requested list.
    :raises ValueError: if both `date` and `list_id` are given.
    """
    if date and list_id:
        raise ValueError("You can't pass a date as well as a list ID.")
    if list_id and subdomains:
        warn("Subdomains parameter is ignored when passing a list ID.")

    if not list_id:
        if (not date) or (date == 'latest'):  # no arguments given: default to latest list
            yesterday = (datetime.utcnow() - timedelta(days=1))
            date = yesterday.strftime('%Y-%m-%d')
        list_id = self._get_list_id_for_date(date, subdomains=subdomains)

    if not self._is_cached(list_id, full):
        self._download_file(list_id, full)  # download list and load into cache

    # The cache file holds the downloaded bytes verbatim, so decode it explicitly
    # as UTF-8 rather than with the platform's default encoding.
    with open(self._cache_path(list_id), encoding='utf-8') as f:  # read list from cache
        # The cached file may be the full list even when only the top million is wanted.
        rows = f if full else islice(f, 1000000)
        # Rows look like "<rank>,<domain>". Iterating a file keeps the line
        # terminator, so strip it; otherwise every domain ends in "\n" and
        # rank lookups by domain name never match.
        domains = [row[row.index(',') + 1:].rstrip('\r\n') for row in rows]

    return TrancoList(date, list_id, domains)
68129

69-
def _get_list_id_for_date(self, date):
70-
r1 = self.session.get('https://tranco-list.eu/daily_list_id?date={}'.format(date))
130+
def _get_list_id_for_date(self, date, subdomains=False):
131+
r1 = self.session.get('https://tranco-list.eu/daily_list_id?date={}&subdomains={}'.format(date, str(subdomains).lower()))
71132
if r1.status_code == 200:
72133
return r1.text
73134
else:
74135
raise AttributeError("The daily list for this date is currently unavailable.")
75136

137+
def _download_file(self, list_id, full=False):
138+
if full:
139+
self._download_full_file(list_id)
140+
else:
141+
self._download_zip_file(list_id)
142+
self._add_to_cache(list_id, full)
143+
76144
def _download_zip_file(self, list_id):
77145
download_url = 'https://tranco-list.eu/download_daily/{}'.format(list_id)
78146
r = self.session.get(download_url, stream=True)
79147
if r.status_code == 200:
80148
with zipfile.ZipFile(BytesIO(r.content)) as z:
81149
with z.open('top-1m.csv') as csvf:
82150
file_bytes = csvf.read()
83-
if self.should_cache:
84-
with open(self._cache_path(list_id), 'wb') as f:
85-
f.write(file_bytes)
86-
lst = file_bytes.decode("utf-8")
87-
return lst
151+
with open(self._cache_path(list_id), 'wb') as f:
152+
f.write(file_bytes)
88153
elif r.status_code == 403:
89154
# List not available as ZIP file
90155
download_url = 'https://tranco-list.eu/download/{}/1000000'.format(list_id)
91156
r2 = self.session.get(download_url)
92157
if r2.status_code == 200:
93158
file_bytes = r2.content
94-
if self.should_cache:
95-
with open(self._cache_path(list_id), 'wb') as f:
96-
f.write(file_bytes)
97-
lst = file_bytes.decode("utf-8")
98-
return lst
159+
with open(self._cache_path(list_id), 'wb') as f:
160+
f.write(file_bytes)
99161
else:
100162
raise AttributeError("The daily list for this date is currently unavailable.")
101163
elif r.status_code == 502:
102164
# List unavailable (bad gateway)
103-
raise AttributeError("The daily list for this date is currently unavailable.")
165+
raise AttributeError("This list is currently unavailable.")
104166
else:
105167
# List unavailable (non-success status code)
106168
raise AttributeError("The daily list for this date is currently unavailable.")
107169

170+
def _download_full_file(self, list_id):
    """
    Download the full version of a list and store it in the cache.
    :param list_id: ID of the list to download.
    :raises AttributeError: if the server does not answer with HTTP 200.
    """
    download_url = 'https://tranco-list.eu/download/{}/full'.format(list_id)
    r = self.session.get(download_url)
    if r.status_code != 200:
        # Fail loudly, like `_download_zip_file` does: returning silently would
        # let the caller record the list as cached although no file was written.
        raise AttributeError("This list is currently unavailable.")
    with open(self._cache_path(list_id), 'wb') as f:
        f.write(r.content)
177+
108178
def configure(self, configuration):
109179
"""
110180
Configure a custom list (https://tranco-list.eu/configure).

0 commit comments

Comments
 (0)