def _validate_remote_manifest(self, manifest_data, data_file_path):
    """Validate a remote manifest against the local JSON schema.

    Validation is best-effort: when ``jsonschema`` cannot be imported or
    the local ``resources/data.schema.json`` file is missing, a warning is
    printed and the manifest is accepted unchecked.

    Keyword Arguments:
    self                   -- This object.
    manifest_data          -- Parsed JSON contents of the remote manifest.
    data_file_path         -- Location the manifest was loaded from; used
                              only to identify it in the error message.

    Return Value:
    Nothing.

    Raises:
    ValueError             -- The manifest does not conform to the schema.
                              The originating jsonschema ValidationError
                              is attached as ``__cause__``.
    """
    # Import lazily and in its own try block, so that only a genuinely
    # missing package is reported as "jsonschema not available".
    try:
        from jsonschema import ValidationError, validate
    except ImportError:
        print("Warning: jsonschema not available, skipping manifest validation.")
        return

    import os

    schema_path = os.path.join(
        os.path.dirname(__file__), "resources", "data.schema.json"
    )

    # Only the schema read is guarded here: a FileNotFoundError raised
    # later, from inside validate(), must not be mistaken for a missing
    # schema file and silently disable validation.
    try:
        with open(schema_path, "r", encoding="utf-8") as schema_file:
            schema = json.load(schema_file)
    except FileNotFoundError:
        print("Warning: Local schema file not found, skipping validation.")
        return

    try:
        validate(instance=manifest_data, schema=schema)
    except ValidationError as error:
        # Chain the cause so the full jsonschema report (failing path,
        # offending value) stays available in the traceback.
        raise ValueError(
            f"Remote manifest validation failed for '{data_file_path}': "
            f"{error.message}\nThis may indicate schema drift."
        ) from error