Skip to content

Commit e09319f

Browse files
authored
Merge pull request #2536 from sherlock-project/feature/username_fuzz
Return support for F+/F- detection via fuzzing
2 parents 284662e + b152428 commit e09319f

File tree

9 files changed

+190
-12
lines changed

9 files changed

+190
-12
lines changed

.github/workflows/exclusions.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
name: Exclusions Updater

# Runs the false-positive target validation tests on a schedule and publishes
# the list of sites that reported a randomly generated username as Claimed to
# the `exclusions` branch.

on:
  schedule:
    #- cron: '0 5 * * 0' # Runs at 05:00 every Sunday
    - cron: '0 5 * * *' # Runs at 05:00 every day
  workflow_dispatch:

jobs:
  update-exclusions:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
          poetry-version: 'latest'

      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev

      - name: Run false positive tests
        run: |
          $(poetry env activate)
          pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt
          deactivate

      - name: Parse false positive detections by desired categories
        id: parse_detections
        run: |
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
            | sort -u > false_positive_exclusions.txt
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
            | sort -u > waf_hits.txt

          # Publish the `changed` output that gates the commit step below.
          # It is "false" only when the exclusions branch already holds an
          # identical list; a missing branch or file counts as a change.
          git fetch origin exclusions || true
          if git cat-file -e origin/exclusions:false_positive_exclusions.txt 2>/dev/null \
            && git show origin/exclusions:false_positive_exclusions.txt \
              | cmp -s - false_positive_exclusions.txt; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Quantify and display results
        run: |
          FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
          WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
          echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
          echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
          echo ">>> WAF hits:" && cat waf_hits.txt

      - name: Commit and push exclusions list
        if: steps.parse_detections.outputs.changed == 'true'
        run: |
          git config user.name "Paul Pfeister (automation)"
          git config user.email "[email protected]"

          # Park the fresh list outside the work tree: checkout refuses to
          # overwrite an untracked file that the target branch already tracks.
          mv false_positive_exclusions.txt "$RUNNER_TEMP/false_positive_exclusions.txt"

          git fetch origin exclusions || true # Allows creation of branch if deleted
          git checkout -B exclusions origin/exclusions || git checkout --orphan exclusions

          mv "$RUNNER_TEMP/false_positive_exclusions.txt" false_positive_exclusions.txt
          git add false_positive_exclusions.txt

          git commit -m "auto: Update exclusions list" || echo "No changes to commit"
          git push origin exclusions

.github/workflows/regression.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ jobs:
4949
macos-latest,
5050
]
5151
python-version: [
52-
'3.9',
5352
'3.10',
5453
'3.11',
5554
'3.12',
55+
'3.13',
5656
]
5757
steps:
5858
- uses: actions/checkout@v4

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ tor = ["torrequest"]
5656

5757
[tool.poetry.group.dev.dependencies]
5858
jsonschema = "^4.0.0"
59+
rstr = "^3.2.2"
60+
pytest = "^8.4.2"
61+
pytest-xdist = "^3.8.0"
5962

6063
[tool.poetry.scripts]
6164
sherlock = 'sherlock_project.sherlock:main'

pytest.ini

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
[pytest]
2-
addopts = --strict-markers
2+
addopts = --strict-markers -m "not validate_targets"
33
markers =
44
online: mark tests as requiring internet access.
5+
validate_targets: mark tests for sweeping manifest validation (sends many requests).
6+
validate_targets_fp: validate_targets, false positive tests only.
7+
validate_targets_fn: validate_targets, false negative tests only.

sherlock_project/sherlock.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,14 @@ def multiple_usernames(username):
169169

170170
def sherlock(
171171
username: str,
172-
site_data: dict,
172+
site_data: dict[str, dict[str, str]],
173173
query_notify: QueryNotify,
174174
tor: bool = False,
175175
unique_tor: bool = False,
176176
dump_response: bool = False,
177177
proxy: Optional[str] = None,
178178
timeout: int = 60,
179-
):
179+
) -> dict[str, dict[str, str | QueryResult]]:
180180
"""Run Sherlock Analysis.
181181
182182
Checks for existence of username on various social media sites.
@@ -507,7 +507,7 @@ def sherlock(
507507
print("+++++++++++++++++++++")
508508

509509
# Notify caller about results of query.
510-
result = QueryResult(
510+
result: QueryResult = QueryResult(
511511
username=username,
512512
site_name=social_network,
513513
site_url_user=url,

tests/conftest.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,30 @@
44
import pytest
55
from sherlock_project.sites import SitesInformation
66

7+
def fetch_local_manifest() -> dict[str, dict[str, str]]:
    """Load the repository's local data.json and map each site name to its information."""
    manifest_path = os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")
    manifest = SitesInformation(data_file_path=manifest_path)
    return {entry.name: entry.information for entry in manifest}
11+
712
@pytest.fixture()
def sites_obj():
    """Provide a SitesInformation object built from the repository's local data.json."""
    manifest_path = os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")
    yield SitesInformation(data_file_path=manifest_path)
1116

1217
@pytest.fixture(scope="session")
def sites_info():
    """Provide the site-name -> site-information mapping, loaded once per test session."""
    manifest = fetch_local_manifest()
    yield manifest
1720

1821
@pytest.fixture(scope="session")
def remote_schema():
    """Download and parse the manifest JSON schema from the upstream master branch."""
    schema_url: str = 'https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.schema.json'
    with urllib.request.urlopen(schema_url) as response:
        parsed_schema = json.load(response)
    yield parsed_schema
27+
28+
def pytest_generate_tests(metafunc):
    """Parametrize `chunked_sites` with one single-site mapping per manifest entry.

    Each parameter is a dict holding exactly one site, and the site name is
    used as the test id. Tests that do not request `chunked_sites` are left
    untouched.
    """
    if "chunked_sites" not in metafunc.fixturenames:
        return

    manifest = fetch_local_manifest()
    site_names = list(manifest.keys())
    single_site_chunks = [{name: manifest[name]} for name in site_names]
    metafunc.parametrize("chunked_sites", single_site_chunks, ids=site_names)

tests/test_manifest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
77
"""Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
88
json_relative: str = '../sherlock_project/resources/data.json'
99
schema_relative: str = '../sherlock_project/resources/data.schema.json'
10-
10+
1111
json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
1212
schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)
1313

tests/test_validate_targets.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pytest
2+
import re
3+
import rstr
4+
5+
from sherlock_project.sherlock import sherlock
6+
from sherlock_project.notify import QueryNotify
7+
from sherlock_project.result import QueryResult, QueryStatus
8+
9+
10+
FALSE_POSITIVE_ATTEMPTS: int = 2  # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit
FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15  # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable)
FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$'  # Used in absence of a regexCheck entry

# Single-pass tokenizer for set_pattern_upper_bound. Escaped pairs and whole
# character classes are matched first so that quantifier characters inside
# them are consumed as-is instead of being mistaken for real quantifiers.
_QUANTIFIER_TOKEN_RE = re.compile(
    r'\\.'                          # escaped character such as `\+` or `\\`
    r'|\[\^?\]?(?:\\.|[^\]\\])*\]'  # whole character class such as `[a-z+*]`
    r'|\{(\d+),\}'                  # open-ended quantifier `{n,}`
    r'|[+*]',                       # bare `+` / `*` quantifier
    re.DOTALL,
)


def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str:
    """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`.

    Args:
        pattern: Regular expression whose unbounded quantifiers should be capped.
        upper_bound: Maximum repetition count to substitute. A value of 0 or
            less disables the rewrite and returns `pattern` unchanged.

    Returns:
        The pattern with `+` rewritten to `{1,upper_bound}`, `*` to
        `{0,upper_bound}` and `{n,}` to `{n,max(n, upper_bound)}`. Escaped
        characters and the contents of character classes are left untouched.
    """
    if upper_bound <= 0:
        return pattern

    def bound_quantifier(match: re.Match) -> str:
        token: str = match.group(0)
        if token == '+':
            return f'{{1,{upper_bound}}}'
        if token == '*':
            return f'{{0,{upper_bound}}}'
        if match.group(1) is not None:
            lower_bound: int = int(match.group(1))
            # Never emit a range whose upper end is below its lower end.
            return f'{{{lower_bound},{max(lower_bound, upper_bound)}}}'
        # Escaped pair or character class: reproduce verbatim.
        return token

    return _QUANTIFIER_TOKEN_RE.sub(bound_quantifier, pattern)
27+
28+
def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
    """Check if a site is likely to produce false positives.

    Makes up to FALSE_POSITIVE_ATTEMPTS queries, each with a fresh username
    generated from `pattern`, and stops at the first AVAILABLE or WAF status.

    Returns:
        The first AVAILABLE/WAF status seen, otherwise the status of the last
        attempt (UNKNOWN when no attempt is made).

    Raises:
        TypeError: If the site's result has no `status` attribute, or that
            attribute is not a QueryStatus.
    """
    conclusive_statuses = (QueryStatus.AVAILABLE, QueryStatus.WAF)
    outcome: QueryStatus = QueryStatus.UNKNOWN

    for _attempt in range(FALSE_POSITIVE_ATTEMPTS):
        notifier: QueryNotify = QueryNotify()
        candidate: str = rstr.xeger(pattern)

        site_results = sherlock(
            username=candidate,
            site_data=sites_info,
            query_notify=notifier,
        )
        result: QueryResult | str = site_results[site]['status']

        if not hasattr(result, 'status'):
            raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
        reported = result.status  # type: ignore
        if type(reported) is not QueryStatus:
            raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(reported)}")
        outcome = reported

        if outcome in conclusive_statuses:
            break

    return outcome
52+
53+
54+
def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus:
    """Check if a site is likely to produce false negatives.

    Queries the site once using its own `username_claimed` manifest entry and
    returns the status reported for it.

    Raises:
        TypeError: If the site's result has no `status` attribute, or that
            attribute is not a QueryStatus.
    """
    notifier: QueryNotify = QueryNotify()
    known_username = sites_info[site]['username_claimed']

    site_results = sherlock(
        username=known_username,
        site_data=sites_info,
        query_notify=notifier,
    )
    result: QueryResult | str = site_results[site]['status']

    if not hasattr(result, 'status'):
        raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
    reported = result.status  # type: ignore
    if type(reported) is not QueryStatus:
        raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(reported)}")

    return reported
72+
73+
@pytest.mark.validate_targets
@pytest.mark.online
class Test_All_Targets:
    """Manifest-wide checks that query every site handed over via `chunked_sites`."""

    @pytest.mark.validate_targets_fp
    def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
        """Query each site with generated usernames and expect them to be reported as available."""
        bounding_enabled: bool = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0

        for site, site_info in chunked_sites.items():
            # Sites without their own regexCheck fall back to the default pattern.
            pattern: str = site_info.get('regexCheck', FALSE_POSITIVE_DEFAULT_PATTERN)
            if bounding_enabled:
                pattern = set_pattern_upper_bound(pattern)

            result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
            assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"

    @pytest.mark.validate_targets_fn
    def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]):
        """Query each site with its known claimed username and expect it to be reported as claimed."""
        for site in chunked_sites.keys():
            result: QueryStatus = false_negative_check(chunked_sites, site)
            assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}"
99+

tox.ini

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@ envlist =
77
py312
88
py311
99
py310
10-
py39
11-
py38
1210

1311
[testenv]
1412
description = Attempt to build and install the package
1513
deps =
1614
coverage
1715
jsonschema
1816
pytest
17+
rstr
1918
allowlist_externals = coverage
2019
commands =
2120
coverage run --source=sherlock_project --module pytest -v
@@ -37,7 +36,7 @@ commands =
3736

3837
[gh-actions]
3938
python =
39+
3.13: py313
4040
3.12: py312
4141
3.11: py311
4242
3.10: py310
43-
3.9: py39

0 commit comments

Comments
 (0)