Skip to content

Commit 2939b07

Browse files
committed
Add subdomains to ignore
1 parent 4aa8cd1 commit 2939b07

File tree

2 files changed

+95
-42
lines changed

2 files changed

+95
-42
lines changed

scripts/check-urls.py

Lines changed: 36 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from queue import SimpleQueue
1111

1212
from github_job_summary import JobSummary
13+
from subdomains import Subdomains
1314

1415
"""
1516
Read file names from stdin (feed from git ls-files)
@@ -18,6 +19,9 @@
1819
Check them with CURL
1920
"""
2021

22+
# To avoid 403 responses
23+
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
24+
2125

2226
class Curl:
2327
"""
@@ -49,35 +53,36 @@ class Curl:
4953
]
5054
)
5155

52-
IGNORE_DOMAINS: frozenset[str] = frozenset(
56+
IGNORE_DOMAINS = Subdomains(
5357
[
54-
"central.sonatype.org",
55-
"curl.se",
56-
"dart.dev",
57-
"getcomposer.org",
58-
"go.dev",
59-
"maven.apache.org",
60-
"mvnrepository.com",
61-
"mvnrepository.com",
62-
"nodejs.org",
63-
"packagist.org",
64-
"pkg.go.dev",
65-
"pub.dev",
66-
"pypi.org",
67-
"pypi.python.org",
68-
"repo1.maven.org",
69-
"tools.ietf.org",
70-
"urllib3.readthedocs.io",
71-
"www.apache.org",
72-
"www.dartlang.org",
73-
"www.gradle.org",
74-
"www.mojohaus.org",
75-
"www.npmjs.com",
76-
"www.nuget.org",
77-
"www.opensource.org",
78-
"www.php.net",
79-
"www.python.org",
80-
"www.w3.org",
58+
".android.com",
59+
".apache.org",
60+
".curl.se",
61+
".dart.dev",
62+
".dartlang.org",
63+
".getcomposer.org",
64+
".go.dev",
65+
".google.com",
66+
".gradle.org",
67+
".ietf.org",
68+
".maven.org",
69+
".microsoft.com",
70+
".mojohaus.org",
71+
".mvnrepository.com",
72+
".nodejs.org",
73+
".npmjs.com",
74+
".nuget.org",
75+
".opensource.org",
76+
".packagist.org",
77+
".php.net",
78+
".phpunit.de",
79+
".pub.dev",
80+
".pypi.org",
81+
".python.org",
82+
".readthedocs.io",
83+
".sonatype.org",
84+
".w3.org",
85+
".wikipedia.org",
8186
]
8287
)
8388

@@ -101,16 +106,7 @@ def valid_url(url: str) -> bool:
101106
if "." not in domain:
102107
# Ignore "localhost" and other domains without .
103108
return False
104-
if domain in IGNORE_DOMAINS:
105-
return False
106-
107-
if (
108-
domain.endswith("android.com")
109-
or domain.endswith(".google.com")
110-
or domain.endswith(".microsoft.com")
111-
or domain.endswith(".wikipedia.org")
112-
):
113-
# Ignore popular domain
109+
if IGNORE_DOMAINS.exists(domain):
114110
return False
115111

116112
if "{{" in url or "}}" in url:
@@ -159,8 +155,6 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
159155
class Task:
160156
_proc: subprocess.Popen[bytes]
161157
_stderr: str | None
162-
# To avoid 403 responses
163-
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
164158

165159
def __init__(self, url: str):
166160
self.url = url
@@ -171,7 +165,7 @@ def __init__(self, url: str):
171165
"--output",
172166
"-",
173167
"--user-agent",
174-
self.USER_AGENT,
168+
USER_AGENT,
175169
self.url,
176170
],
177171
stdout=open(os.devnull, "w"),
@@ -256,7 +250,7 @@ def url_checker(num_workers: int = 8) -> None:
256250
item = WORKER_QUEUE.get()
257251
if item is None:
258252
queue_is_empty = True
259-
print("URL queue is over")
253+
print("--- url queue is over ---")
260254
break
261255
url = item
262256
workers[i] = create_new_task(url)

scripts/subdomains.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import collections
2+
import typing
3+
from collections import defaultdict
4+
5+
6+
class Subdomains:
    """Membership test for a set of domain names with subdomain wildcards.

    Entries that start with a dot (e.g. ``".google.com"``) match the domain
    itself and any of its subdomains; plain entries (e.g. ``"android.com"``)
    match that exact name only.
    """

    # Exact-match names (entries given without a leading dot).
    plain_domains: set[str]
    # Suffix patterns grouped by label count, ascending: each item is
    # (level, [reversed label tuples]), e.g. (2, [("com", "google")]).
    domains_by_levels: tuple[tuple[int, list[tuple[str, ...]]], ...]

    def __init__(self, domains: typing.Sequence[str]):
        """Split *domains* into exact names and leading-dot suffix patterns."""
        self.plain_domains = set()

        tmp_level_with_dom: defaultdict[int, list[tuple[str, ...]]] = collections.defaultdict(list)
        for d in domains:
            if d.startswith("."):
                level, parts = self.get_level(d)
                tmp_level_with_dom[level].append(parts)
            else:
                self.plain_domains.add(d)

        # Sort groups by level so exists() can stop scanning as soon as the
        # patterns have more labels than the queried name.
        self.domains_by_levels = tuple(
            (level, tmp_level_with_dom[level]) for level in sorted(tmp_level_with_dom)
        )

    def exists(self, domain_name: str) -> bool:
        """Return True if *domain_name* matches an exact entry or a suffix pattern.

        Comparison is case-sensitive; callers are expected to pass
        already-normalized (lowercase) host names — TODO confirm with callers.
        """
        if domain_name in self.plain_domains:
            return True

        level, parts = self.get_level(domain_name)

        for known_level, domains in self.domains_by_levels:
            if known_level > level:
                # Groups are sorted by level; every remaining pattern has more
                # labels than the query and can never be one of its suffixes.
                return False

            # Each pattern is already a reversed label tuple, so it can be
            # compared to the query's label prefix directly.
            if any(parts[:known_level] == dom for dom in domains):
                return True

        return False

    @staticmethod
    def get_level(domain_name: str) -> tuple[int, tuple[str, ...]]:
        """Return ``(label count, labels reversed)`` for *domain_name*.

        Leading/trailing dots are ignored, so ``".google.com"`` and
        ``"google.com."`` both yield ``(2, ("com", "google"))``.
        """
        parts = domain_name.strip(".").split(".")
        return len(parts), tuple(reversed(parts))
50+
51+
52+
def test() -> None:
    """Smoke-test Subdomains: dotted entries match subdomains, plain entries do not."""
    checker = Subdomains([".very.long.domain.name", "android.com", ".google.com"])
    assert checker.exists("test.google.com")
    assert not checker.exists("test.android.com")
56+
57+
58+
# Allow running this module directly as a quick self-check.
if __name__ == "__main__":
    test()

0 commit comments

Comments
 (0)