|
7 | 7 | import requests
|
8 | 8 | import secrets
|
9 | 9 |
|
| 10 | + |
| 11 | +MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" |
| 12 | +EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt" |
| 13 | + |
10 | 14 | class SiteInformation:
|
11 | 15 | def __init__(self, name, url_home, url_username_format, username_claimed,
|
12 | 16 | information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
|
@@ -67,12 +71,17 @@ def __str__(self):
|
67 | 71 | Return Value:
|
68 | 72 | Nicely formatted string to get information about this object.
|
69 | 73 | """
|
70 |
| - |
| 74 | + |
71 | 75 | return f"{self.name} ({self.url_home})"
|
72 | 76 |
|
73 | 77 |
|
74 | 78 | class SitesInformation:
|
75 |
| - def __init__(self, data_file_path=None): |
| 79 | +    def __init__( |
| 80 | +        self, |
| 81 | +        data_file_path: str | None = None, |
| 82 | +        honor_exclusions: bool = True, |
| 83 | +        do_not_exclude: list[str] | tuple[str, ...] = (), |
| 84 | +    ): |
76 | 85 | """Create Sites Information Object.
|
77 | 86 |
|
78 | 87 | Contains information about all supported websites.
|
@@ -110,7 +119,7 @@ def __init__(self, data_file_path=None):
|
110 | 119 | # The default data file is the live data.json which is in the GitHub repo. The reason why we are using
|
111 | 120 | # this instead of the local one is so that the user has the most up-to-date data. This prevents
|
112 | 121 | # users from creating issue about false positives which has already been fixed or having outdated data
|
113 |
| - data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" |
| 122 | + data_file_path = MANIFEST_URL |
114 | 123 |
|
115 | 124 | # Ensure that specified data file has correct extension.
|
116 | 125 | if not data_file_path.lower().endswith(".json"):
|
@@ -152,9 +161,31 @@ def __init__(self, data_file_path=None):
|
152 | 161 | raise FileNotFoundError(f"Problem while attempting to access "
|
153 | 162 | f"data file '{data_file_path}'."
|
154 | 163 | )
|
155 |
| - |
| 164 | + |
156 | 165 | site_data.pop('$schema', None)
|
157 | 166 |
|
| 167 | +        if honor_exclusions: |
| 168 | +            # Best-effort: fetch the shared false-positive exclusion list and |
| 169 | +            # drop those sites from the manifest. A timeout keeps a stalled |
| 170 | +            # server from hanging the constructor. |
| 171 | +            try: |
| 172 | +                response = requests.get(url=EXCLUSIONS_URL, timeout=30) |
| 173 | +                response.raise_for_status() |
| 174 | +                exclusions = [line.strip() for line in response.text.splitlines() if line.strip()] |
| 175 | + |
| 176 | +                for exclusion in exclusions: |
| 177 | +                    if exclusion in do_not_exclude: |
| 178 | +                        continue |
| 179 | +                    # dict.pop() with a default never raises KeyError, |
| 180 | +                    # so no try/except is needed here. |
| 181 | +                    site_data.pop(exclusion, None) |
| 182 | + |
| 183 | +            except requests.RequestException: |
| 184 | +                # If there was any problem loading the exclusions, just continue without them |
| 185 | +                print("Warning: Could not load exclusions, continuing without them.") |
| 186 | + |
158 | 189 | self.sites = {}
|
159 | 190 |
|
160 | 191 | # Add all site information from the json file to internal site list.
|
@@ -194,7 +225,7 @@ def remove_nsfw_sites(self, do_not_remove: list = []):
|
194 | 225 | for site in self.sites:
|
195 | 226 | if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
|
196 | 227 | continue
|
197 |
| - sites[site] = self.sites[site] |
| 228 | + sites[site] = self.sites[site] |
198 | 229 | self.sites = sites
|
199 | 230 |
|
200 | 231 | def site_name_list(self):
|
|
0 commit comments