285
285
}
286
286
287
287
288
- def _repeated_characters (project_name : str ) -> TypoCheckMatch :
288
+ def _repeated_characters (project_name : str , corpus : set [ str ] ) -> TypoCheckMatch :
289
289
"""
290
290
Removes any identical consecutive characters to check for typosquatting
291
291
by repeated characters.
@@ -301,13 +301,13 @@ def _repeated_characters(project_name: str) -> TypoCheckMatch:
301
301
# Build a new name by removing the duplicated character
302
302
deduplicated = project_name [:idx ] + project_name [idx + 1 :]
303
303
# If the new name is in the list of popular names, return it
304
- if deduplicated in _TOP_PROJECT_NAMES :
304
+ if deduplicated in corpus :
305
305
return "repeated_characters" , deduplicated
306
306
307
307
return None
308
308
309
309
310
- def _omitted_characters (project_name : str ) -> TypoCheckMatch :
310
+ def _omitted_characters (project_name : str , corpus : set [ str ] ) -> TypoCheckMatch :
311
311
"""
312
312
Inserts allowed characters into name to check for typosquatting by omission.
313
313
For example, 'evnt-stream' could be typosquatting 'event-stream'.
@@ -330,13 +330,13 @@ def _omitted_characters(project_name: str) -> TypoCheckMatch:
330
330
# Build new name by inserting the current character in the current position
331
331
constructed = project_name [:idx ] + character + project_name [idx :]
332
332
# If the new name is in the list of popular names, return it
333
- if constructed in _TOP_PROJECT_NAMES :
333
+ if constructed in corpus :
334
334
return "omitted_characters" , constructed
335
335
336
336
return None
337
337
338
338
339
- def _swapped_characters (project_name : str ) -> TypoCheckMatch :
339
+ def _swapped_characters (project_name : str , corpus : set [ str ] ) -> TypoCheckMatch :
340
340
"""
341
341
Swaps adjacent characters to check for typosquatting by swapped characters.
342
342
For example, 'spihnx' could be typosquatting 'sphinx'.
@@ -352,13 +352,13 @@ def _swapped_characters(project_name: str) -> TypoCheckMatch:
352
352
swapped_string = "" .join (char_list )
353
353
354
354
# If the new name is in the list of popular names, return it
355
- if swapped_string in _TOP_PROJECT_NAMES :
355
+ if swapped_string in corpus :
356
356
return "swapped_characters" , swapped_string
357
357
358
358
return None
359
359
360
360
361
- def _swapped_words (project_name : str ) -> TypoCheckMatch :
361
+ def _swapped_words (project_name : str , corpus : set [ str ] ) -> TypoCheckMatch :
362
362
"""
363
363
Reorders project_name substrings separated by `-` to look for typosquatting.
364
364
For example, 'stream-event' could be squatting 'event-stream'.
@@ -381,13 +381,13 @@ def _swapped_words(project_name: str) -> TypoCheckMatch:
381
381
# Join the words using `-` to create a new name
382
382
reconstructed = "-" .join (p )
383
383
# If the new name is in the list of popular names, return it
384
- if reconstructed in _TOP_PROJECT_NAMES :
384
+ if reconstructed in corpus :
385
385
return "swapped_words" , reconstructed
386
386
387
387
return None
388
388
389
389
390
- def _common_typos (project_name : str ) -> TypoCheckMatch :
390
+ def _common_typos (project_name : str , corpus : set [ str ] ) -> TypoCheckMatch :
391
391
"""
392
392
Applies each of the common typos to each of the characters in the given name.
393
393
Checks if each result is in the list of popular names.
@@ -404,25 +404,30 @@ def _common_typos(project_name: str) -> TypoCheckMatch:
404
404
typo_project_name = "" .join (typo_project_name_chars )
405
405
406
406
# Check if the new package name is in the list of popular packages
407
- if typo_project_name in _TOP_PROJECT_NAMES :
407
+ if typo_project_name in corpus :
408
408
return "common_typos" , typo_project_name
409
409
410
410
return None
411
411
412
412
413
def typo_check_name(
    project_name: str, corpus: set[str] | None = None
) -> TypoCheckMatch:
    """
    Check if the given project name is a typo of another project name.

    Runs multiple checks, and if any of them match, returns the matched name.

    `corpus` is the collection of known project names that candidate
    spellings are looked up in. When it is omitted (or None), the static
    `_TOP_PROJECT_NAMES` collection is used instead. An empty corpus is
    honoured as-is: only None triggers the fallback.

    Returns the first truthy result produced by a check (each check yields a
    `(check_name, matched_name)` pair on a hit), or None when nothing matches.
    """
    if corpus is None:
        # Fall back to the static list if not provided
        corpus = _TOP_PROJECT_NAMES

    # Run each check in order; the first one that reports a match wins
    for check in (
        _repeated_characters,
        _omitted_characters,
        _swapped_characters,
        _swapped_words,
        _common_typos,
    ):
        if result := check(project_name, corpus=corpus):
            return result
    return None
0 commit comments