Skip to content

Commit 0cd49a7

Browse files
authored
refactor: allow injection of corpus (#17725)
In preparation for obtaining the input list from other sources, allow for dependency injection of the needed input, but allow fallback to the static corpus for now. Signed-off-by: Mike Fiedler <[email protected]>
1 parent e0d3de3 commit 0cd49a7

File tree

2 files changed

+28
-27
lines changed

2 files changed

+28
-27
lines changed

tests/unit/packaging/test_typosnyper.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,15 @@
3131
("requestz", ("common_typos", "requests")),
3232
],
3333
)
34-
def test_typo_check_name(name, expected, monkeypatch):
35-
# Set known entries in the _TOP_PROJECT_NAMES list
36-
# TODO: Replace with a better way to generate corpus
37-
monkeypatch.setattr(
38-
"warehouse.packaging.typosnyper._TOP_PROJECT_NAMES",
39-
{
40-
"numpy",
41-
"requests",
42-
"sphinx",
43-
"beautifulsoup4",
44-
"jinja2",
45-
"python-dateutil",
46-
},
47-
)
34+
def test_typo_check_name(name, expected):
35+
# Set known corpus entries for testing
36+
test_names_corpus = {
37+
"numpy",
38+
"requests",
39+
"sphinx",
40+
"beautifulsoup4",
41+
"jinja2",
42+
"python-dateutil",
43+
}
4844

49-
assert typo_check_name(name) == expected
45+
assert typo_check_name(name, corpus=test_names_corpus) == expected

warehouse/packaging/typosnyper.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@
285285
}
286286

287287

288-
def _repeated_characters(project_name: str) -> TypoCheckMatch:
288+
def _repeated_characters(project_name: str, corpus: set[str]) -> TypoCheckMatch:
289289
"""
290290
Removes any identical consecutive characters to check for typosquatting
291291
by repeated characters.
@@ -301,13 +301,13 @@ def _repeated_characters(project_name: str) -> TypoCheckMatch:
301301
# Build a new name by removing the duplicated character
302302
deduplicated = project_name[:idx] + project_name[idx + 1 :]
303303
# If the new name is in the list of popular names, return it
304-
if deduplicated in _TOP_PROJECT_NAMES:
304+
if deduplicated in corpus:
305305
return "repeated_characters", deduplicated
306306

307307
return None
308308

309309

310-
def _omitted_characters(project_name: str) -> TypoCheckMatch:
310+
def _omitted_characters(project_name: str, corpus: set[str]) -> TypoCheckMatch:
311311
"""
312312
Inserts allowed characters into name to check for typosquatting by omission.
313313
For example, 'evnt-stream' could be typosquatting 'event-stream'.
@@ -330,13 +330,13 @@ def _omitted_characters(project_name: str) -> TypoCheckMatch:
330330
# Build new name by inserting the current character in the current position
331331
constructed = project_name[:idx] + character + project_name[idx:]
332332
# If the new name is in the list of popular names, return it
333-
if constructed in _TOP_PROJECT_NAMES:
333+
if constructed in corpus:
334334
return "omitted_characters", constructed
335335

336336
return None
337337

338338

339-
def _swapped_characters(project_name: str) -> TypoCheckMatch:
339+
def _swapped_characters(project_name: str, corpus: set[str]) -> TypoCheckMatch:
340340
"""
341341
Swaps adjacent characters to check for typosquatting by swapped characters.
342342
For example, 'spihnx' could be typosquatting 'sphinx'.
@@ -352,13 +352,13 @@ def _swapped_characters(project_name: str) -> TypoCheckMatch:
352352
swapped_string = "".join(char_list)
353353

354354
# If the new name is in the list of popular names, return it
355-
if swapped_string in _TOP_PROJECT_NAMES:
355+
if swapped_string in corpus:
356356
return "swapped_characters", swapped_string
357357

358358
return None
359359

360360

361-
def _swapped_words(project_name: str) -> TypoCheckMatch:
361+
def _swapped_words(project_name: str, corpus: set[str]) -> TypoCheckMatch:
362362
"""
363363
Reorders project_name substrings separated by `-` to look for typosquatting.
364364
For example, 'stream-event' could be squatting 'event-stream'.
@@ -381,13 +381,13 @@ def _swapped_words(project_name: str) -> TypoCheckMatch:
381381
# Join the words using `-` to create a new name
382382
reconstructed = "-".join(p)
383383
# If the new name is in the list of popular names, return it
384-
if reconstructed in _TOP_PROJECT_NAMES:
384+
if reconstructed in corpus:
385385
return "swapped_words", reconstructed
386386

387387
return None
388388

389389

390-
def _common_typos(project_name: str) -> TypoCheckMatch:
390+
def _common_typos(project_name: str, corpus: set[str]) -> TypoCheckMatch:
391391
"""
392392
Applies each of the common typos to each of the characters in the given name.
393393
Checks if each result is in the list of popular names.
@@ -404,25 +404,30 @@ def _common_typos(project_name: str) -> TypoCheckMatch:
404404
typo_project_name = "".join(typo_project_name_chars)
405405

406406
# Check if the new package name is in the list of popular packages
407-
if typo_project_name in _TOP_PROJECT_NAMES:
407+
if typo_project_name in corpus:
408408
return "common_typos", typo_project_name
409409

410410
return None
411411

412412

413-
def typo_check_name(project_name: str) -> TypoCheckMatch:
413+
def typo_check_name(project_name: str, corpus=None) -> TypoCheckMatch:
414414
"""
415415
Check if the given project name is a typo of another project name.
416416
417417
Runs multiple checks, and if any of them match, returns the name of the check and the matched name.
418418
"""
419+
if corpus is None:
420+
# Fall back to the static list if not provided
421+
corpus = _TOP_PROJECT_NAMES
422+
423+
# Run each check in order
419424
for check in (
420425
_repeated_characters,
421426
_omitted_characters,
422427
_swapped_characters,
423428
_swapped_words,
424429
_common_typos,
425430
):
426-
if result := check(project_name):
431+
if result := check(project_name, corpus=corpus):
427432
return result
428433
return None

0 commit comments

Comments
 (0)