sherlock-project
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 1 addition & 1 deletion b/‎.github/CODEOWNERS‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/exclusions.yml‎
Lines changed: 89 additions & 0 deletions b/‎.github/workflows/exclusions.yml‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎.github/workflows/regression.yml‎
Lines changed: 26 additions & 3 deletions b/‎.github/workflows/regression.yml‎
Lines changed: 26 additions & 3 deletions
diff --git a/‎.github/workflows/validate_modified_targets.yml‎
Lines changed: 100 additions & 0 deletions b/‎.github/workflows/validate_modified_targets.yml‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 2 additions & 2 deletions b/‎Dockerfile‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎devel/summarize_site_validation.py‎
Lines changed: 72 additions & 0 deletions b/‎devel/summarize_site_validation.py‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎docs/README.md‎
Lines changed: 5 additions & 6 deletions b/‎docs/README.md‎
Lines changed: 5 additions & 6 deletions
@@ -1,5 +1,5 @@
 ### REPOSITORY
-/.github/CODEOWNERS @sdushantha
+/.github/CODEOWNERS @sdushantha @ppfeister
 /.github/FUNDING.yml @sdushantha
 /LICENSE @sdushantha
 
 
@@ -0,0 +1,89 @@
+name: Exclusions Updater
+
+on:
+  schedule:
+    #- cron: '0 5 * * 0'  # Runs at 05:00 every Sunday
+    - cron: '0 5 * * *' # Runs at 05:00 every day
+  workflow_dispatch:
+
+jobs:
+  update-exclusions:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: Install Poetry
+        uses: abatilo/actions-poetry@v4
+        with:
+          poetry-version: 'latest'
+
+      - name: Install dependencies
+        run: |
+          poetry install --no-interaction --with dev
+
+      # Runs the false-positive suite inside the Poetry-managed environment and
+      # mirrors the pytest output into fp_test_results.txt. `poetry run` is used
+      # (as in the target validation workflow) instead of activating and
+      # deactivating the virtualenv by hand.
+      - name: Run false positive tests
+        run: |
+          poetry run pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt
+
+      - name: Parse false positive detections by desired categories
+        run: |
+          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
+            | sort -u > false_positive_exclusions.txt
+          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
+            | sort -u > waf_hits.txt
+
+      # Compares the freshly generated false_positive_exclusions.txt with the
+      # copy stored on the `exclusions` branch and publishes the verdict as the
+      # step output `exclusions_changed` ('true' or 'false'). A missing branch
+      # or a missing file on that branch is reported as changed.
+      - name: Detect if exclusions list changed
+        id: detect_changes
+        run: |
+          git fetch origin exclusions || true
+
+          if git show origin/exclusions:false_positive_exclusions.txt >/dev/null 2>&1; then
+            # If the exclusions branch and file exist, compare
+            if git diff --quiet origin/exclusions -- false_positive_exclusions.txt; then
+              echo "exclusions_changed=false" >> "$GITHUB_OUTPUT"
+            else
+              echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
+            fi
+          else
+            # If the exclusions branch or file do not exist, treat as changed
+            echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Quantify and display results
+        run: |
+          FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
+          WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
+          echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
+          echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
+          echo ">>> WAF hits:" && cat waf_hits.txt
+
+      # Publishes the new list to the `exclusions` branch. The list is renamed
+      # to a temporary file and stashed so it survives the branch switch, the
+      # `exclusions` branch is checked out (or recreated as an empty orphan
+      # branch when it cannot be checked out), and the stashed file is then
+      # restored under its real name, committed and pushed. Only runs when the
+      # detection step reported a change.
+      - name: Commit and push exclusions list
+        if: steps.detect_changes.outputs.exclusions_changed == 'true'
+        run: |
+          git config user.name "Paul Pfeister (automation)"
+          git config user.email "[email protected]"
+
+          mv false_positive_exclusions.txt false_positive_exclusions.txt.tmp
+
+          git add -f false_positive_exclusions.txt.tmp # -f required to override .gitignore
+          git stash push -m "stash false positive exclusion list" -- false_positive_exclusions.txt.tmp
+
+          git fetch origin exclusions || true # Allows creation of branch if deleted
+          git checkout -B exclusions origin/exclusions || (git checkout --orphan exclusions && git rm -rf .)
+
+          git stash pop || true
+
+          mv false_positive_exclusions.txt.tmp false_positive_exclusions.txt
+
+          git rm -f false_positive_exclusions.txt.tmp || true
+          git add false_positive_exclusions.txt
+          git commit -m "auto: update exclusions list" || echo "No changes to commit"
+          git push origin exclusions
@@ -11,6 +11,7 @@ on:
       - '**/*.py'
       - '**/*.ini'
       - '**/*.toml'
+      - 'Dockerfile'
   push:
     branches:
       - master
@@ -21,11 +22,13 @@ on:
       - '**/*.py'
       - '**/*.ini'
       - '**/*.toml'
+      - 'Dockerfile'
 
 jobs:
   tox-lint:
-    # Linting is ran through tox to ensure that the same linter is used by local runners
     runs-on: ubuntu-latest
+    # Linting is run through tox to ensure that the same linter
+    # is used by local runners
     steps:
       - uses: actions/checkout@v4
       - name: Set up linting environment
@@ -41,18 +44,19 @@ jobs:
   tox-matrix:
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: false # We want to know what specicic versions it fails on
+      # We want to know what specific versions it fails on
+      fail-fast: false
       matrix:
         os: [
           ubuntu-latest,
           windows-latest,
           macos-latest,
         ]
         python-version: [
-          '3.9',
           '3.10',
           '3.11',
           '3.12',
+          '3.13',
         ]
     steps:
       - uses: actions/checkout@v4
@@ -67,3 +71,22 @@ jobs:
           pip install tox-gh-actions
       - name: Run tox
         run: tox
+  docker-build-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      # Reads the first `version = "..."` entry of pyproject.toml and exposes it
+      # as the step output `version`.
+      - name: Get version from pyproject.toml
+        id: get-version
+        run: |
+          VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+      # Builds the image with the version taken from pyproject.toml as the
+      # VERSION_TAG build argument.
+      - name: Build Docker image
+        env:
+          # Handed over through the environment so that the value read from
+          # pyproject.toml is never expanded into the script text itself.
+          VERSION_TAG: ${{ steps.get-version.outputs.version }}
+        run: |
+          docker build \
+            --build-arg VERSION_TAG="$VERSION_TAG" \
+            -t sherlock-test:latest .
+      - name: Test Docker image runs
+        run: docker run --rm sherlock-test:latest --version
@@ -0,0 +1,100 @@
+name: Modified Target Validation
+
+on:
+  pull_request_target:
+    branches:
+      - master
+    paths:
+      - "sherlock_project/resources/data.json"
+
+jobs:
+  validate-modified-targets:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+        with:
+          ref: ${{ github.base_ref }}
+          fetch-depth: 1
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: Install Poetry
+        uses: abatilo/actions-poetry@v4
+        with:
+          poetry-version: 'latest'
+
+      - name: Install dependencies
+        run: |
+          poetry install --no-interaction --with dev
+
+      - name: Drop in place updated manifest from base
+        run: |
+          cp sherlock_project/resources/data.json data.json.base
+          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr --depth=1
+          git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json
+          cp sherlock_project/resources/data.json data.json.head
+
+      # Lists every target that is new or whose definition differs between
+      # data.json.base and data.json.head, and exposes the comma-separated,
+      # sorted names as the step output `changed_targets`. The names originate
+      # from the pull request, so they are always expanded quoted and printed
+      # without escape or glob interpretation.
+      - name: Discover modified targets
+        id: discover-modified
+        run: |
+          CHANGED=$(
+            python - <<'EOF'
+          import json
+          with open("data.json.base") as f: base = json.load(f)
+          with open("data.json.head") as f: head = json.load(f)
+
+          changed = []
+          for k, v in head.items():
+              if k not in base or base[k] != v:
+                  changed.append(k)
+
+          print(",".join(sorted(changed)))
+          EOF
+          )
+
+          # Preserve changelist
+          echo ">>> Changed targets:"
+          printf '%s\n' "$CHANGED" | tr ',' '\n'
+          echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"
+
+      # Runs the validation suite against the modified targets only and writes
+      # the results to validation_results.xml. Test failures do not fail the
+      # job (continue-on-error).
+      - name: Validate modified targets
+        if: steps.discover-modified.outputs.changed_targets != ''
+        continue-on-error: true
+        env:
+          # Target names come from the pull request's data.json and are
+          # untrusted. They are passed through the environment instead of being
+          # expanded into the script, so a crafted name cannot inject shell
+          # commands into this pull_request_target job.
+          CHANGED_TARGETS: ${{ steps.discover-modified.outputs.changed_targets }}
+        run: |
+          poetry run pytest -q --tb no -rA -m validate_targets -n 20 \
+            --chunked-sites "$CHANGED_TARGETS" \
+            --junitxml=validation_results.xml
+
+      - name: Prepare validation summary
+        if: steps.discover-modified.outputs.changed_targets != ''
+        id: prepare-summary
+        run: |
+          summary=$(
+            poetry run python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results"
+          )
+          echo "$summary" > validation_summary.md
+
+      - name: Announce validation results
+        if: steps.discover-modified.outputs.changed_targets != ''
+        uses: actions/github-script@v8
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('validation_summary.md', 'utf8');
+            await github.rest.issues.createComment({
+              issue_number: context.payload.pull_request.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: body,
+            });
+
+      - name: This step shows as ran when no modifications are found
+        if: steps.discover-modified.outputs.changed_targets == ''
+        run: |
+          echo "No modified targets found"
@@ -2,9 +2,9 @@
   # 1. Update the version tag in the Dockerfile to match the version in sherlock/__init__.py
   # 2. Update the VCS_REF tag to match the tagged version's FULL commit hash
   # 3. Build image with BOTH latest and version tags
-    # i.e. `docker build -t sherlock/sherlock:0.15.0 -t sherlock/sherlock:latest .`
+    # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
 
-FROM python:3.12-slim-bullseye as build
+FROM python:3.12-slim-bullseye AS build
 WORKDIR /sherlock
 
 RUN pip3 install --no-cache-dir --upgrade pip
 
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# This module summarizes the results of site validation tests queued by
+# workflow validate_modified_targets for presentation in Issue comments.
+
+from defusedxml import ElementTree as ET
+import sys
+from pathlib import Path
+
+def summarize_junit_xml(xml_path: Path) -> str:
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    suite = root.find('testsuite')
+
+    pass_message: str = ":heavy_check_mark: &nbsp; Pass"
+    fail_message: str = ":x: &nbsp; Fail"
+
+    if suite is None:
+        raise ValueError("Invalid JUnit XML: No testsuite found")
+
+    summary_lines: list[str] = []
+    summary_lines.append("#### Automatic validation of changes\n")
+    summary_lines.append("| Target | F+ Check | F- Check |")
+    summary_lines.append("|---|---|---|")
+
+    failures = int(suite.get('failures', 0))
+    errors_detected: bool = False
+
+    results: dict[str, dict[str, str]] = {}
+
+    for testcase in suite.findall('testcase'):
+        test_name = testcase.get('name').split('[')[0]
+        site_name = testcase.get('name').split('[')[1].rstrip(']')
+        failure = testcase.find('failure')
+        error = testcase.find('error')
+
+        if site_name not in results:
+            results[site_name] = {}
+
+        if test_name == "test_false_neg":
+            results[site_name]['F- Check'] = pass_message if failure is None and error is None else fail_message
+        elif test_name == "test_false_pos":
+            results[site_name]['F+ Check'] = pass_message if failure is None and error is None else fail_message
+
+        if error is not None:
+            errors_detected = True
+
+    for result in results:
+        summary_lines.append(f"| {result} | {results[result].get('F+ Check', 'Error!')} | {results[result].get('F- Check', 'Error!')} |")
+
+    if failures > 0:
+        summary_lines.append("\n___\n" +
+            "\nFailures were detected on at least one updated target. Commits containing accuracy failures" +
+            " will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).")
+
+    if errors_detected:
+        summary_lines.append("\n___\n" +
+            "\n**Errors were detected during validation. Please review the workflow logs.**")
+
+    return "\n".join(summary_lines)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: summarize_site_validation.py <junit-xml-file>")
+        sys.exit(1)
+
+    xml_path: Path = Path(sys.argv[1])
+    if not xml_path.is_file():
+        print(f"Error: File '{xml_path}' does not exist.")
+        sys.exit(1)
+
+    summary: str = summarize_junit_xml(xml_path)
+    print(summary)
@@ -1,6 +1,6 @@
-<p align=center>
+<p align="center">
   <br>
-  <a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png"/></a>
+  <a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png" alt="sherlock"/></a>
   <br>
   <span>Hunt down social media accounts by username across <a href="https://sherlockproject.xyz/sites">400+ social networks</a></span>
   <br>
@@ -15,8 +15,7 @@
 </p>
 
 <p align="center">
-<img width="70%" height="70%" src="images/demo.png"/>
-</a>
+<img width="70%" height="70%" src="images/demo.png" alt="demo"/>
 </p>
 
 
@@ -115,14 +114,14 @@ $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
 }]
 ```
 
-Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmaticaly via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
+Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
 
 ## Credits
 
 Thank you to everyone who has contributed to Sherlock! ❤️
 
 <a href="https://github.com/sherlock-project/sherlock/graphs/contributors">
-  <img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" noZoom />
+  <img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
 </a>
 
 ## Star history