Skip to content

Commit 8882310

Browse files
committed
feat: honor automatic exclusions list
1 parent 6d15f13 commit 8882310

File tree

3 files changed

+52
-8
lines changed

3 files changed

+52
-8
lines changed

.github/workflows/exclusions.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,5 +76,5 @@ jobs:
7676
7777
git add false_positive_exclusions.txt
7878
79-
git commit -m "auto: Update exclusions list" || echo "No changes to commit"
79+
git commit -m "auto: update exclusions list" || echo "No changes to commit"
8080
git push origin exclusions

sherlock_project/sherlock.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,14 @@ def main():
727727
help="Disable creation of a txt file",
728728
)
729729

730+
parser.add_argument(
731+
"--ignore-exclusions",
732+
action="store_true",
733+
dest="ignore_exclusions",
734+
default=False,
735+
help="Ignore upstream exclusions (may return more false positives)",
736+
)
737+
730738
args = parser.parse_args()
731739

732740
# If the user presses CTRL-C, exit gracefully without throwing errors
@@ -784,7 +792,8 @@ def main():
784792
try:
785793
if args.local:
786794
sites = SitesInformation(
787-
os.path.join(os.path.dirname(__file__), "resources/data.json")
795+
os.path.join(os.path.dirname(__file__), "resources/data.json"),
796+
honor_exclusions=False,
788797
)
789798
else:
790799
json_file_location = args.json_file
@@ -804,7 +813,11 @@ def main():
804813
head_commit_sha = pull_request_json["head"]["sha"]
805814
json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"
806815

807-
sites = SitesInformation(json_file_location)
816+
sites = SitesInformation(
817+
data_file_path=json_file_location,
818+
honor_exclusions=not args.ignore_exclusions,
819+
do_not_exclude=args.site_list,
820+
)
808821
except Exception as error:
809822
print(f"ERROR: {error}")
810823
sys.exit(1)

sherlock_project/sites.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
import requests
88
import secrets
99

10+
11+
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
12+
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
13+
1014
class SiteInformation:
1115
def __init__(self, name, url_home, url_username_format, username_claimed,
1216
information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@@ -67,12 +71,17 @@ def __str__(self):
6771
Return Value:
6872
Nicely formatted string to get information about this object.
6973
"""
70-
74+
7175
return f"{self.name} ({self.url_home})"
7276

7377

7478
class SitesInformation:
75-
def __init__(self, data_file_path=None):
79+
def __init__(
80+
self,
81+
data_file_path: str|None = None,
82+
honor_exclusions: bool = True,
83+
do_not_exclude: list[str] = [],
84+
):
7685
"""Create Sites Information Object.
7786
7887
Contains information about all supported websites.
@@ -110,7 +119,7 @@ def __init__(self, data_file_path=None):
110119
# The default data file is the live data.json which is in the GitHub repo. The reason why we are using
111120
# this instead of the local one is so that the user has the most up-to-date data. This prevents
112121
# users from creating issues about false positives which have already been fixed, or from having outdated data
113-
data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
122+
data_file_path = MANIFEST_URL
114123

115124
# Ensure that specified data file has correct extension.
116125
if not data_file_path.lower().endswith(".json"):
@@ -152,9 +161,31 @@ def __init__(self, data_file_path=None):
152161
raise FileNotFoundError(f"Problem while attempting to access "
153162
f"data file '{data_file_path}'."
154163
)
155-
164+
156165
site_data.pop('$schema', None)
157166

167+
if honor_exclusions:
168+
try:
169+
response = requests.get(url=EXCLUSIONS_URL)
170+
if response.status_code == 200:
171+
exclusions = response.text.splitlines()
172+
exclusions = [exclusion.strip() for exclusion in exclusions]
173+
174+
for site in do_not_exclude:
175+
if site in exclusions:
176+
exclusions.remove(site)
177+
178+
for exclusion in exclusions:
179+
try:
180+
site_data.pop(exclusion, None)
181+
except KeyError:
182+
pass
183+
184+
except Exception:
185+
# If there was any problem loading the exclusions, just continue without them
186+
print("Warning: Could not load exclusions, continuing without them.")
187+
honor_exclusions = False
188+
158189
self.sites = {}
159190

160191
# Add all site information from the json file to internal site list.
@@ -194,7 +225,7 @@ def remove_nsfw_sites(self, do_not_remove: list = []):
194225
for site in self.sites:
195226
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
196227
continue
197-
sites[site] = self.sites[site]
228+
sites[site] = self.sites[site]
198229
self.sites = sites
199230

200231
def site_name_list(self):

0 commit comments

Comments
 (0)