feat: honor automatic exclusions list

ppfeister · ppfeister · commit 888231045082 · 2025-09-15T21:56:54.000-04:00
diff --git a/.github/workflows/exclusions.yml b/.github/workflows/exclusions.yml
@@ -76,5 +76,5 @@ jobs:
 
           git add false_positive_exclusions.txt
 
-          git commit -m "auto: Update exclusions list" || echo "No changes to commit"
+          git commit -m "auto: update exclusions list" || echo "No changes to commit"
           git push origin exclusions
diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py
@@ -727,6 +727,14 @@ def main():
         help="Disable creation of a txt file",
     )
 
+    parser.add_argument(
+        "--ignore-exclusions",
+        action="store_true",
+        dest="ignore_exclusions",
+        default=False,
+        help="Ignore upstream exclusions (may return more false positives)",
+    )
+
     args = parser.parse_args()
 
     # If the user presses CTRL-C, exit gracefully without throwing errors
@@ -784,7 +792,8 @@ def main():
     try:
         if args.local:
             sites = SitesInformation(
-                os.path.join(os.path.dirname(__file__), "resources/data.json")
+                os.path.join(os.path.dirname(__file__), "resources/data.json"),
+                honor_exclusions=False,
             )
         else:
             json_file_location = args.json_file
@@ -804,7 +813,11 @@ def main():
                     head_commit_sha = pull_request_json["head"]["sha"]
                     json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"
 
-            sites = SitesInformation(json_file_location)
+            sites = SitesInformation(
+                data_file_path=json_file_location,
+                honor_exclusions=not args.ignore_exclusions,
+                do_not_exclude=args.site_list,
+            )
     except Exception as error:
         print(f"ERROR:  {error}")
         sys.exit(1)
diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py
@@ -7,6 +7,10 @@
 import requests
 import secrets
 
+
+MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
+EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
+
 class SiteInformation:
     def __init__(self, name, url_home, url_username_format, username_claimed,
                 information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@@ -67,12 +71,17 @@ def __str__(self):
         Return Value:
         Nicely formatted string to get information about this object.
         """
-        
+
         return f"{self.name} ({self.url_home})"
 
 
 class SitesInformation:
-    def __init__(self, data_file_path=None):
+    def __init__(
+            self,
+            data_file_path: str|None = None,
+            honor_exclusions: bool = True,
+            do_not_exclude: list[str] = [],
+        ):
         """Create Sites Information Object.
 
         Contains information about all supported websites.
@@ -110,7 +119,7 @@ def __init__(self, data_file_path=None):
             # The default data file is the live data.json which is in the GitHub repo. The reason why we are using
             # this instead of the local one is so that the user has the most up-to-date data. This prevents
             # users from creating issue about false positives which has already been fixed or having outdated data
-            data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
+            data_file_path = MANIFEST_URL
 
         # Ensure that specified data file has correct extension.
         if not data_file_path.lower().endswith(".json"):
@@ -152,9 +161,31 @@ def __init__(self, data_file_path=None):
                 raise FileNotFoundError(f"Problem while attempting to access "
                                         f"data file '{data_file_path}'."
                                         )
-        
+
         site_data.pop('$schema', None)
 
+        if honor_exclusions:
+            try:
+                response = requests.get(url=EXCLUSIONS_URL)
+                if response.status_code == 200:
+                    exclusions = response.text.splitlines()
+                    exclusions = [exclusion.strip() for exclusion in exclusions]
+
+                    for site in do_not_exclude:
+                        if site in exclusions:
+                            exclusions.remove(site)
+
+                    for exclusion in exclusions:
+                        try:
+                            site_data.pop(exclusion, None)
+                        except KeyError:
+                            pass
+
+            except Exception:
+                # Only exceptions land here (a non-200 reply is skipped silently); continue without exclusions
+                print("Warning: Could not load exclusions, continuing without them.")
+                honor_exclusions = False
+
         self.sites = {}
 
         # Add all site information from the json file to internal site list.
@@ -194,7 +225,7 @@ def remove_nsfw_sites(self, do_not_remove: list = []):
         for site in self.sites:
             if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
                 continue
-            sites[site] = self.sites[site]  
+            sites[site] = self.sites[site]
         self.sites =  sites
 
     def site_name_list(self):