Merge pull request #2536 from sherlock-project/feature/username_fuzz

ppfeister · web-flow · commit e09319f29f89 · 2025-09-15T21:05:35.000-04:00
Return support for F+/F- detection via fuzzing
diff --git a/.github/workflows/exclusions.yml b/.github/workflows/exclusions.yml
@@ -0,0 +1,64 @@
+name: Exclusions Updater
+
+on:
+  schedule:
+    #- cron: '0 5 * * 0'  # Runs at 05:00 every Sunday
+    - cron: '0 5 * * *' # Runs at 05:00 every day
+  workflow_dispatch:
+
+jobs:
+  update-exclusions:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: Install Poetry
+        uses: abatilo/actions-poetry@v4
+        with:
+          poetry-version: 'latest'
+
+      - name: Install dependencies
+        run: |
+          poetry install --no-interaction --with dev
+
+      - name: Run false positive tests
+        run: |
+          $(poetry env activate)
+          pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt
+          deactivate
+
+      - name: Parse false positive detections by desired categories
+        id: parse_detections
+        run: |
+          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
+            | sort -u > false_positive_exclusions.txt
+          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
+            | sort -u > waf_hits.txt
+
+      - name: Quantify and display results
+        run: |
+          FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
+          WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
+          echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
+          echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
+          echo ">>> WAF hits:" && cat waf_hits.txt
+
+      - name: Commit and push exclusions list
+        # No step in this workflow writes a `changed` output, so an `if:` guard on
+        # `steps.parse_detections.outputs.changed` would never be true. Change
+        # detection is done below against the staged diff instead.
+        run: |
+          git config user.name "Paul Pfeister (automation)"
+          git config user.email "code@pfeister.dev"
+
+          git fetch origin exclusions || true # Allows creation of branch if deleted
+          git checkout -B exclusions origin/exclusions || git checkout --orphan exclusions
+
+          git add false_positive_exclusions.txt
+
+          # Only commit and push when the staged content actually differs.
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "auto: Update exclusions list"
+            git push origin exclusions
+          fi
diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
@@ -49,10 +49,10 @@ jobs:
           macos-latest,
         ]
         python-version: [
-          '3.9',
           '3.10',
           '3.11',
           '3.12',
+          '3.13',
         ]
     steps:
       - uses: actions/checkout@v4
diff --git a/pyproject.toml b/pyproject.toml
@@ -56,6 +56,9 @@ tor = ["torrequest"]
 
 [tool.poetry.group.dev.dependencies]
 jsonschema = "^4.0.0"
+rstr = "^3.2.2"
+pytest = "^8.4.2"
+pytest-xdist = "^3.8.0"
 
 [tool.poetry.scripts]
 sherlock = 'sherlock_project.sherlock:main'
diff --git a/pytest.ini b/pytest.ini
@@ -1,4 +1,7 @@
 [pytest]
-addopts = --strict-markers
+addopts = --strict-markers -m "not validate_targets"
 markers =
     online: mark tests are requiring internet access.
+    validate_targets: mark tests for sweeping manifest validation (sends many requests).
+    validate_targets_fp: validate_targets, false positive tests only.
+    validate_targets_fn: validate_targets, false negative tests only.
diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py
@@ -169,14 +169,14 @@ def multiple_usernames(username):
 
 def sherlock(
     username: str,
-    site_data: dict,
+    site_data: dict[str, dict[str, str]],
     query_notify: QueryNotify,
     tor: bool = False,
     unique_tor: bool = False,
     dump_response: bool = False,
     proxy: Optional[str] = None,
     timeout: int = 60,
-):
+) -> dict[str, dict[str, str | QueryResult]]:
     """Run Sherlock Analysis.
 
     Checks for existence of username on various social media sites.
@@ -507,7 +507,7 @@ def sherlock(
             print("+++++++++++++++++++++")
 
         # Notify caller about results of query.
-        result = QueryResult(
+        result: QueryResult = QueryResult(
             username=username,
             site_name=social_network,
             site_url_user=url,
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,20 +4,30 @@
 import pytest
 from sherlock_project.sites import SitesInformation
 
+def fetch_local_manifest() -> dict[str, dict[str, str]]:
+    """Load the repository's bundled data.json and map each site name to its information."""
+    manifest_path = os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")
+    return {entry.name: entry.information for entry in SitesInformation(data_file_path=manifest_path)}
+
 @pytest.fixture()
 def sites_obj():
     sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
     yield sites_obj
 
 @pytest.fixture(scope="session")
 def sites_info():
-    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
-    sites_iterable = {site.name: site.information for site in sites_obj}
-    yield sites_iterable
+    yield fetch_local_manifest()
 
 @pytest.fixture(scope="session")
 def remote_schema():
     schema_url: str = 'https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.schema.json'
     with urllib.request.urlopen(schema_url) as remoteschema:
         schemadat = json.load(remoteschema)
     yield schemadat
+
+def pytest_generate_tests(metafunc):
+    """Parametrize `chunked_sites` with one single-site dict per manifest entry.
+
+    Tests that do not request the `chunked_sites` fixture are left untouched.
+    Each generated test is identified by its site name.
+    """
+    if "chunked_sites" not in metafunc.fixturenames:
+        return
+
+    manifest = fetch_local_manifest()
+    metafunc.parametrize(
+        "chunked_sites",
+        [{site_name: site_data} for site_name, site_data in manifest.items()],
+        ids=list(manifest),
+    )
diff --git a/tests/test_manifest.py b/tests/test_manifest.py
@@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
     """Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
     json_relative: str = '../sherlock_project/resources/data.json'
     schema_relative: str = '../sherlock_project/resources/data.schema.json'
-    
+
     json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
     schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)
 
diff --git a/tests/test_validate_targets.py b/tests/test_validate_targets.py
@@ -0,0 +1,99 @@
+import pytest
+import re
+import rstr
+
+from sherlock_project.sherlock import sherlock
+from sherlock_project.notify import QueryNotify
+from sherlock_project.result import QueryResult, QueryStatus
+
+
+FALSE_POSITIVE_ATTEMPTS: int = 2    # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit
+FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15  # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable)
+FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$'  # Used in absence of a regexCheck entry
+
+
+def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str:
+    """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`.
+
+    Every unescaped `{n,}`, `+` and `*` in ``pattern`` is rewritten into an
+    explicit `{n,m}` range. The substitution is purely textual, so an unescaped
+    `+` or `*` inside a character class (e.g. `[a-z+]`) is rewritten as well.
+
+    Args:
+        pattern: Regular expression source to rewrite.
+        upper_bound: Repetition cap to substitute. For `{n,}` where ``n`` is
+            not below this cap, ``n`` itself is used as the upper bound.
+
+    Returns:
+        The pattern with bounded quantifiers.
+    """
+    def replace_upper_bound(match: re.Match) -> str:
+        lower_bound: int = int(match.group(1))
+        # Use a separate name here: assigning to `upper_bound` inside this
+        # closure would make it a local and raise UnboundLocalError when read.
+        capped_upper: int = max(lower_bound, upper_bound)
+        return f'{{{lower_bound},{capped_upper}}}'
+
+    pattern = re.sub(r'(?<!\\)\{(\d+),\}', replace_upper_bound, pattern) # {n,}
+    pattern = re.sub(r'(?<!\\)\+', f'{{1,{upper_bound}}}', pattern) # +
+    pattern = re.sub(r'(?<!\\)\*', f'{{0,{upper_bound}}}', pattern) # *
+
+    return pattern
+
+def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
+    """Query a site with usernames generated from ``pattern`` and report the status seen.
+
+    Up to FALSE_POSITIVE_ATTEMPTS usernames are tried. The first AVAILABLE or WAF
+    status ends the probing early; otherwise the status of the last attempt is
+    returned (UNKNOWN if no attempt ran).
+
+    Raises:
+        TypeError: If the query result lacks a ``status`` attribute, or that
+            attribute is not a QueryStatus.
+    """
+    conclusive_statuses = (QueryStatus.AVAILABLE, QueryStatus.WAF)
+    status: QueryStatus = QueryStatus.UNKNOWN
+
+    for _attempt in range(FALSE_POSITIVE_ATTEMPTS):
+        notifier: QueryNotify = QueryNotify()
+        candidate: str = rstr.xeger(pattern)
+
+        findings = sherlock(
+            username=candidate,
+            site_data=sites_info,
+            query_notify=notifier,
+        )
+        result: QueryResult | str = findings[site]['status']
+
+        if not hasattr(result, 'status'):
+            raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
+        if type(result.status) is not QueryStatus: # type: ignore
+            raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore
+        status = result.status # type: ignore
+
+        # A conclusive answer makes further random usernames unnecessary.
+        if status in conclusive_statuses:
+            break
+
+    return status
+
+
+def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus:
+    """Query a site with its manifest ``username_claimed`` and return the status seen.
+
+    Raises:
+        TypeError: If the query result lacks a ``status`` attribute, or that
+            attribute is not a QueryStatus.
+    """
+    notifier: QueryNotify = QueryNotify()
+
+    findings = sherlock(
+        username=sites_info[site]['username_claimed'],
+        site_data=sites_info,
+        query_notify=notifier,
+    )
+    result: QueryResult | str = findings[site]['status']
+
+    if not hasattr(result, 'status'):
+        raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
+    if type(result.status) is not QueryStatus: # type: ignore
+        raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore
+
+    return result.status # type: ignore
+
+@pytest.mark.validate_targets
+@pytest.mark.online
+class Test_All_Targets:
+    """Online checks that probe manifest targets for false positives and false negatives."""
+
+    @pytest.mark.validate_targets_fp
+    def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
+        """Probe each supplied site with generated usernames and expect an AVAILABLE result."""
+        for site, site_info in chunked_sites.items():
+            # Sites without a regexCheck entry are fuzzed with the default pattern.
+            pattern: str = site_info.get('regexCheck', FALSE_POSITIVE_DEFAULT_PATTERN)
+
+            # An upper bound of 0 disables quantifier capping.
+            if FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0:
+                pattern = set_pattern_upper_bound(pattern)
+
+            result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
+            assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"
+
+    @pytest.mark.validate_targets_fn
+    def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]):
+        """Query each supplied site with its known claimed username and expect a CLAIMED result."""
+        for site in chunked_sites:
+            result: QueryStatus = false_negative_check(chunked_sites, site)
+            assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}"
+
diff --git a/tox.ini b/tox.ini
@@ -7,15 +7,14 @@ envlist =
     py312
     py311
     py310
-    py39
-    py38
 
 [testenv]
 description = Attempt to build and install the package
 deps =
     coverage
     jsonschema
     pytest
+    rstr
 allowlist_externals = coverage
 commands =
     coverage run --source=sherlock_project --module pytest -v
@@ -37,7 +36,7 @@ commands =
 
 [gh-actions]
 python =
+    3.13: py313
     3.12: py312
     3.11: py311
     3.10: py310
-    3.9: py39

Original file line number	Diff line number	Diff line change
`@@ -49,10 +49,10 @@ jobs:`
`49`	`49`	`macos-latest,`
`50`	`50`	`]`
`51`	`51`	`python-version: [`
`52`		`- '3.9',`
`53`	`52`	`'3.10',`
`54`	`53`	`'3.11',`
`55`	`54`	`'3.12',`
	`55`	`+ '3.13',`
`56`	`56`	`]`
`57`	`57`	`steps:`
`58`	`58`	`- uses: actions/checkout@v4`