Skip to content

Commit c457fb1

Browse files
committed
improve clarity and up max ipv6 ASNs
1 parent 87c7dcc commit c457fb1

File tree

1 file changed

+53
-30
lines changed

1 file changed

+53
-30
lines changed

contrib/seeds/makeseeds.py

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,16 @@
1010
import sys
1111
import dns.resolver
1212
import collections
13+
from typing import List, Dict, Union
1314

1415
NSEEDS=512
1516

16-
MAX_SEEDS_PER_ASN=2
17+
MAX_SEEDS_PER_ASN = {
18+
'ipv4': 2,
19+
'ipv6': 10,
20+
}
1721

18-
MIN_BLOCKS = 337600
22+
MIN_BLOCKS = 730000
1923

2024
# These are hosts that have been observed to be behaving strangely (e.g.
2125
# aggressively connecting to every node).
@@ -40,9 +44,13 @@
4044
r"23.99"
4145
r")")
4246

43-
def parseline(line):
47+
def parseline(line: str) -> Union[dict, None]:
48+
""" Parses a line from `seeds_main.txt` into a dictionary of details for that line.
49+
Returns `None` if the line could not be parsed.
50+
"""
4451
sline = line.split()
4552
if len(sline) < 11:
53+
# line too short to be valid, skip it.
4654
return None
4755
m = PATTERN_IPV4.match(sline[0])
4856
sortkey = None
@@ -107,25 +115,26 @@ def parseline(line):
107115
'sortkey': sortkey,
108116
}
109117

110-
def dedup(ips):
111-
'''deduplicate by address,port'''
118+
def dedup(ips: List[Dict]) -> List[Dict]:
119+
""" Remove duplicates from `ips` where multiple ips share address and port. """
112120
d = {}
113121
for ip in ips:
114122
d[ip['ip'],ip['port']] = ip
115123
return list(d.values())
116124

117-
def filtermultiport(ips):
118-
'''Filter out hosts with more nodes per IP'''
125+
def filtermultiport(ips: List[Dict]) -> List[Dict]:
126+
""" Filter out hosts with more than one node per IP"""
119127
hist = collections.defaultdict(list)
120128
for ip in ips:
121129
hist[ip['sortkey']].append(ip)
122130
return [value[0] for (key,value) in list(hist.items()) if len(value)==1]
123131

124-
def lookup_asn(net, ip):
125-
'''
126-
Look up the asn for an IP (4 or 6) address by querying cymru.com, or None
127-
if it could not be found.
128-
'''
132+
def lookup_asn(net: str, ip: str) -> Union[int, None]:
133+
""" Look up the asn for an `ip` address by querying cymru.com
134+
on network `net` (e.g. ipv4 or ipv6).
135+
136+
Returns an integer ASN, or None if it could not be found.
137+
"""
129138
try:
130139
if net == 'ipv4':
131140
ipaddr = ip
@@ -147,20 +156,33 @@ def lookup_asn(net, ip):
147156
return None
148157

149158
# Based on Greg Maxwell's seed_filter.py
150-
def filterbyasn(ips, max_per_asn, max_per_net):
159+
def filterbyasn(ips: List[Dict], max_per_asn: Dict, max_per_net: int) -> List[Dict]:
160+
""" Prunes `ips` by
161+
(a) trimming ips to have at most `max_per_net` ips from each net (e.g. ipv4, ipv6); and
162+
(b) trimming ips to have at most `max_per_asn` ips from each asn in each net.
163+
"""
151164
# Sift out ips by type
152165
ips_ipv46 = [ip for ip in ips if ip['net'] in ['ipv4', 'ipv6']]
153166
ips_onion = [ip for ip in ips if ip['net'] == 'onion']
154167

155168
# Filter IPv46 by ASN, and limit to max_per_net per network
156169
result = []
157-
net_count = collections.defaultdict(int)
158-
asn_count = collections.defaultdict(int)
159-
for ip in ips_ipv46:
170+
net_count: Dict[str, int] = collections.defaultdict(int)
171+
asn_count: Dict[int, int] = collections.defaultdict(int)
172+
173+
for i, ip in enumerate(ips_ipv46):
174+
if i % 10 == 0:
175+
# give progress update
176+
print(f"{i:6d}/{len(ips_ipv46)} [{100*i/len(ips_ipv46):04.1f}%]\r", file=sys.stderr, end='', flush=True)
177+
160178
if net_count[ip['net']] == max_per_net:
179+
# do not add this ip as we already have too many
180+
# ips from this network
161181
continue
162182
asn = lookup_asn(ip['net'], ip['ip'])
163-
if asn is None or asn_count[asn] == max_per_asn:
183+
if asn is None or asn_count[asn] == max_per_asn[ip['net']]:
184+
# do not add this ip as we already have too many
185+
# ips from this ASN on this network
164186
continue
165187
asn_count[asn] += 1
166188
net_count[ip['net']] += 1
@@ -170,54 +192,55 @@ def filterbyasn(ips, max_per_asn, max_per_net):
170192
result.extend(ips_onion[0:max_per_net])
171193
return result
172194

173-
def ip_stats(ips):
174-
hist = collections.defaultdict(int)
195+
def ip_stats(ips: List[Dict]) -> str:
196+
""" Format and return pretty string from `ips`. """
197+
hist: Dict[str, int] = collections.defaultdict(int)
175198
for ip in ips:
176199
if ip is not None:
177200
hist[ip['net']] += 1
178201

179-
return '%6d %6d %6d' % (hist['ipv4'], hist['ipv6'], hist['onion'])
202+
return f"{hist['ipv4']:6d} {hist['ipv6']:6d} {hist['onion']:6d}"
180203

181204
def main():
182205
lines = sys.stdin.readlines()
183206
ips = [parseline(line) for line in lines]
184207

185208
print('\x1b[7m IPv4 IPv6 Onion Pass \x1b[0m', file=sys.stderr)
186-
print('%s Initial' % (ip_stats(ips)), file=sys.stderr)
209+
print(f'{ip_stats(ips):s} Initial', file=sys.stderr)
187210
# Skip entries with invalid address.
188211
ips = [ip for ip in ips if ip is not None]
189-
print('%s Skip entries with invalid address' % (ip_stats(ips)), file=sys.stderr)
212+
print(f'{ip_stats(ips):s} Skip entries with invalid address', file=sys.stderr)
190213
# Skip duplicates (in case multiple seeds files were concatenated)
191214
ips = dedup(ips)
192-
print('%s After removing duplicates' % (ip_stats(ips)), file=sys.stderr)
215+
print(f'{ip_stats(ips):s} After removing duplicates', file=sys.stderr)
193216
# Skip entries from suspicious hosts.
194217
ips = [ip for ip in ips if ip['ip'] not in SUSPICIOUS_HOSTS]
195-
print('%s Skip entries from suspicious hosts' % (ip_stats(ips)), file=sys.stderr)
218+
print(f'{ip_stats(ips):s} Skip entries from suspicious hosts', file=sys.stderr)
196219
# Enforce minimal number of blocks.
197220
ips = [ip for ip in ips if ip['blocks'] >= MIN_BLOCKS]
198-
print('%s Enforce minimal number of blocks' % (ip_stats(ips)), file=sys.stderr)
221+
print(f'{ip_stats(ips):s} Enforce minimal number of blocks', file=sys.stderr)
199222
# Require service bit 1.
200223
ips = [ip for ip in ips if (ip['service'] & 1) == 1]
201-
print('%s Require service bit 1' % (ip_stats(ips)), file=sys.stderr)
224+
print(f'{ip_stats(ips):s} Require service bit 1', file=sys.stderr)
202225
# Require at least 50% 30-day uptime for clearnet, 10% for onion.
203226
req_uptime = {
204227
'ipv4': 50,
205228
'ipv6': 50,
206229
'onion': 10,
207230
}
208231
ips = [ip for ip in ips if ip['uptime'] > req_uptime[ip['net']]]
209-
print('%s Require minimum uptime' % (ip_stats(ips)), file=sys.stderr)
232+
print(f'{ip_stats(ips):s} Require minimum uptime', file=sys.stderr)
210233
# Require a known and recent user agent.
211234
ips = [ip for ip in ips if PATTERN_AGENT.match(ip['agent'])]
212-
print('%s Require a known and recent user agent' % (ip_stats(ips)), file=sys.stderr)
235+
print(f'{ip_stats(ips):s} Require a known and recent user agent', file=sys.stderr)
213236
# Sort by availability (and use last success as tie breaker)
214237
ips.sort(key=lambda x: (x['uptime'], x['lastsuccess'], x['ip']), reverse=True)
215238
# Filter out hosts with multiple bitcoin ports, these are likely abusive
216239
ips = filtermultiport(ips)
217-
print('%s Filter out hosts with multiple bitcoin ports' % (ip_stats(ips)), file=sys.stderr)
240+
print(f'{ip_stats(ips):s} Filter out hosts with multiple bitcoin ports', file=sys.stderr)
218241
# Look up ASNs and limit results, both per ASN and globally.
219242
ips = filterbyasn(ips, MAX_SEEDS_PER_ASN, NSEEDS)
220-
print('%s Look up ASNs and limit results per ASN and per net' % (ip_stats(ips)), file=sys.stderr)
243+
print(f'{ip_stats(ips):s} Look up ASNs and limit results per ASN and per net', file=sys.stderr)
221244
# Sort the results by IP address (for deterministic output).
222245
ips.sort(key=lambda x: (x['net'], x['sortkey']))
223246
for ip in ips:

0 commit comments

Comments
 (0)