Skip to content

Commit 1c65d5f

Browse files
authored
feat(security): add package name typosquatting detection (#1059)
Signed-off-by: Amine <[email protected]>
1 parent 256fd0c commit 1c65d5f

File tree

8 files changed

+5412
-1
lines changed

8 files changed

+5412
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,4 @@ docs/_build
181181
bin/
182182
requirements.txt
183183
.macaron_env_file
184+
**/.DS_Store

src/macaron/config/defaults.ini

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,3 +600,14 @@ major_threshold = 20
600600
epoch_threshold = 3
601601
# The number of days +/- the day of publish the calendar versioning day may be.
602602
day_publish_error = 4
603+
604+
# The threshold ratio for two packages to be considered similar.
605+
distance_ratio_threshold = 0.95
606+
# The substitution cost for two characters that are adjacent to each other on the keyboard.
607+
keyboard = 0.8
608+
# The prefix scaling factor for the Jaro-Winkler distance.
609+
scaling = 0.15
610+
# The cost for two characters that are not close to each other on the keyboard.
611+
cost = 1.0
612+
# The path to the file that contains the list of popular packages.
613+
popular_packages_path =

src/macaron/malware_analyzer/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
5252
- **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`.
5353
- **Dependency**: Will be run if the One Release heuristic fails.
5454

55+
10. **Typosquatting Presence**
56+
- **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting.
57+
- **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`.
58+
- **Dependency**: None.
59+
5560
### Contributing
5661

5762
When contributing an analyzer, it must meet the following requirements:
@@ -64,7 +69,7 @@ When contributing an analyzer, it must meet the following requirements:
6469
- Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated.
6570
- Assign a rule ID to the rule. This will be used to backtrack to determine if it was triggered.
6671
- Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details.
67-
- If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples).
72+
- If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples).
6873

6974
### Confidence Score Motivation
7075

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ class Heuristics(str, Enum):
3737
#: Indicates that the package has an unusually large version number for a single release.
3838
ANOMALOUS_VERSION = "anomalous_version"
3939

40+
#: Indicates that the package name is similar to a popular package.
41+
TYPOSQUATTING_PRESENCE = "typosquatting_presence"
42+
4043

4144
class HeuristicResult(str, Enum):
4245
"""Result type indicating the outcome of a heuristic."""
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Analyzer checks if there is typosquatting presence in the package name."""
5+
import logging
6+
import os
7+
8+
from macaron import MACARON_PATH
9+
from macaron.config.defaults import defaults
10+
from macaron.errors import HeuristicAnalyzerValueError
11+
from macaron.json_tools import JsonType
12+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
13+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
14+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer):
20+
"""Check whether the PyPI package has typosquatting presence."""
21+
22+
KEYBOARD_LAYOUT = {
23+
"1": (0, 0),
24+
"2": (0, 1),
25+
"3": (0, 2),
26+
"4": (0, 3),
27+
"5": (0, 4),
28+
"6": (0, 5),
29+
"7": (0, 6),
30+
"8": (0, 7),
31+
"9": (0, 8),
32+
"0": (0, 9),
33+
"-": (0, 10),
34+
"q": (1, 0),
35+
"w": (1, 1),
36+
"e": (1, 2),
37+
"r": (1, 3),
38+
"t": (1, 4),
39+
"y": (1, 5),
40+
"u": (1, 6),
41+
"i": (1, 7),
42+
"o": (1, 8),
43+
"p": (1, 9),
44+
"a": (2, 0),
45+
"s": (2, 1),
46+
"d": (2, 2),
47+
"f": (2, 3),
48+
"g": (2, 4),
49+
"h": (2, 5),
50+
"j": (2, 6),
51+
"k": (2, 7),
52+
"l": (2, 8),
53+
"z": (3, 0),
54+
"x": (3, 1),
55+
"c": (3, 2),
56+
"v": (3, 3),
57+
"b": (3, 4),
58+
"n": (3, 5),
59+
"m": (3, 6),
60+
}
61+
62+
def __init__(self, popular_packages_path: str | None = None) -> None:
    """Set up the analyzer and load its configuration.

    Parameters
    ----------
    popular_packages_path : str | None
        Optional path to a popular packages file. When it is empty or None,
        the ``resources/popular_packages.txt`` file under ``MACARON_PATH`` is used.
    """
    super().__init__(
        name="typosquatting_presence_analyzer",
        heuristic=Heuristics.TYPOSQUATTING_PRESENCE,
        depends_on=None,
    )
    bundled_path = os.path.join(MACARON_PATH, "resources/popular_packages.txt")
    # The attribute keeps its historical name even when it holds the caller's override.
    self.default_path = popular_packages_path if popular_packages_path else bundled_path

    loaded_settings = self._load_defaults()
    (
        self.popular_packages,
        self.distance_ratio_threshold,
        self.keyboard,
        self.scaling,
        self.cost,
    ) = loaded_settings
72+
73+
def _load_defaults(self) -> tuple[list[str], float, float, float, float]:
    """Load the typosquatting settings and the popular packages list.

    The tunables are read from the ``heuristic.pypi`` section of defaults.ini
    when that section exists; otherwise the built-in values below are used.
    A non-empty ``popular_packages_path`` in that section takes precedence
    over ``self.default_path``.

    Returns
    -------
    tuple[list[str], float, float, float, float]:
        The popular packages list, distance ratio threshold,
        keyboard awareness factor, scaling factor, and cost factor.

    Raises
    ------
    HeuristicAnalyzerValueError
        If the popular packages file does not exist or cannot be read.
    """
    section_name = "heuristic.pypi"
    path = self.default_path
    distance_ratio_threshold = 0.95
    keyboard = 0.8
    scaling = 0.15
    cost = 1.0

    if defaults.has_section(section_name):
        section = defaults[section_name]
        path_from_config = section.get("popular_packages_path", self.default_path)
        # Fall back to default if the path in defaults.ini is empty.
        if path_from_config.strip():
            path = path_from_config
        distance_ratio_threshold = section.getfloat("distance_ratio_threshold", 0.95)
        keyboard = section.getfloat("keyboard", 0.8)
        scaling = section.getfloat("scaling", 0.15)
        cost = section.getfloat("cost", 1.0)

    if not path or not os.path.exists(path):
        error_message = "Popular packages file not found or path not configured"
        logger.debug(error_message)
        raise HeuristicAnalyzerValueError(error_message)

    try:
        with open(path, encoding="utf-8") as file:
            raw_lines = file.read().splitlines()
    except OSError as error:
        error_message = "Could not read popular packages file"
        logger.debug(error_message)
        raise HeuristicAnalyzerValueError(error_message) from error

    # Normalise the entries: surrounding whitespace would defeat the exact-name
    # comparison done during analysis (flagging the genuine package as a
    # typosquat of itself), and blank lines would make an effectively empty
    # list look populated.
    stripped_names = (line.strip() for line in raw_lines)
    popular_packages_list = [name for name in stripped_names if name]

    return (
        popular_packages_list,
        distance_ratio_threshold,
        keyboard,
        scaling,
        cost,
    )
121+
122+
def are_neighbors(self, first_char: str, second_char: str) -> bool:
    """Tell whether two characters sit next to each other in ``KEYBOARD_LAYOUT``.

    Two keys are neighbors when both their row index and their column index
    differ by at most one, which covers horizontal, vertical and diagonal
    adjacency. A character compared with itself therefore counts as a neighbor.

    Parameters
    ----------
    first_char : str
        The first character.
    second_char : str
        The second character.

    Returns
    -------
    bool
        True if both characters are in the layout and adjacent, False otherwise.
    """
    layout = self.KEYBOARD_LAYOUT
    first_position = layout.get(first_char)
    second_position = layout.get(second_char)
    # Characters that are not part of the layout can never be neighbors.
    if not (first_position and second_position):
        return False

    row_gap = abs(first_position[0] - second_position[0])
    column_gap = abs(first_position[1] - second_position[1])
    return row_gap <= 1 and column_gap <= 1
145+
146+
def substitution_func(self, first_char: str, second_char: str) -> float:
    """Return the cost of replacing one character with another.

    Parameters
    ----------
    first_char : str
        The first character.
    second_char : str
        The second character.

    Returns
    -------
    float
        0.0 for identical characters, ``self.keyboard`` when keyboard awareness
        is enabled (non-zero) and the characters are keyboard neighbors,
        and ``self.cost`` in every other case.
    """
    if first_char == second_char:
        return 0.0

    # A falsy keyboard factor disables keyboard awareness without consulting the layout.
    is_near_miss = self.keyboard and self.are_neighbors(first_char, second_char)
    return self.keyboard if is_near_miss else self.cost
167+
168+
def jaro_distance(self, package_name: str, popular_package_name: str) -> float:
    """Compute the Jaro score between two package names.

    The returned value behaves as a similarity: identical names score 1.0 and
    names without any matching character score 0.0. Out-of-order matches are
    penalised through ``substitution_func`` instead of a flat count.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro score of the two package names.
    """
    if package_name == popular_package_name:
        return 1.0

    own_len, popular_len = len(package_name), len(popular_package_name)
    if not own_len or not popular_len:
        return 0.0

    # Characters only match when their positions are at most this far apart.
    window = max(own_len, popular_len) // 2 - 1

    own_matched = [False] * own_len
    popular_matched = [False] * popular_len

    # Greedily pair each character with the first free equal character in its window.
    for own_pos, own_char in enumerate(package_name):
        low = max(0, own_pos - window)
        high = min(own_pos + window + 1, popular_len)
        for popular_pos in range(low, high):
            if not popular_matched[popular_pos] and own_char == popular_package_name[popular_pos]:
                own_matched[own_pos] = True
                popular_matched[popular_pos] = True
                break

    match_count = own_matched.count(True)
    if match_count == 0:
        return 0.0

    # Line the matched characters of both names up in order of appearance and
    # charge every pair that disagrees.
    own_sequence = [char for char, hit in zip(package_name, own_matched) if hit]
    popular_sequence = [char for char, hit in zip(popular_package_name, popular_matched) if hit]
    penalty = 0.0  # A float because substitution costs may be fractional.
    for own_char, popular_char in zip(own_sequence, popular_sequence):
        if own_char != popular_char:
            penalty += self.substitution_func(own_char, popular_char)

    penalty /= 2.0  # Each out-of-order pair was charged from both sides.

    return (match_count / own_len + match_count / popular_len + (match_count - penalty) / match_count) / 3.0
229+
230+
def ratio(self, package_name: str, popular_package_name: str) -> float:
    """Compute the Jaro-Winkler ratio of two package names.

    The Jaro score is boosted according to how many of the first four
    characters the two names share, weighted by ``self.scaling``.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro-Winkler ratio, i.e. the Jaro score plus the common-prefix bonus.
    """
    prefix_weight = self.scaling
    base_score = self.jaro_distance(package_name, popular_package_name)

    # Count the leading characters both names share, looking at four at most.
    shared_prefix = 0
    for own_char, popular_char in zip(package_name[:4], popular_package_name[:4]):
        if own_char != popular_char:
            break
        shared_prefix += 1

    return base_score + shared_prefix * prefix_weight * (1 - base_score)
257+
258+
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package.

    The package name is compared against every entry of the popular packages
    list. A name that is itself on the list passes immediately; otherwise the
    first popular package whose similarity ratio reaches
    ``self.distance_ratio_threshold`` causes a failure.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis.
    """
    if not self.popular_packages:
        warning_message = "Popular packages file is empty"
        logger.warning(warning_message)
        return HeuristicResult.SKIP, {"warning": warning_message}

    package_name = pypi_package_json.component_name

    # Settle the exact-match case against the whole list before any similarity
    # test. Otherwise the verdict depends on list order: a popular package such
    # as "boto3" would be reported as a typosquat of "boto" whenever "boto"
    # happens to be listed first.
    if package_name in self.popular_packages:
        return HeuristicResult.PASS, {"package_name": package_name}

    for popular_package in self.popular_packages:
        distance_ratio = self.ratio(package_name, popular_package)
        if distance_ratio >= self.distance_ratio_threshold:
            logger.info(
                "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)",
                package_name,
                popular_package,
                distance_ratio,
            )
            return HeuristicResult.FAIL, {
                "package_name": package_name,
                "popular_package": popular_package,
                "similarity_ratio": distance_ratio,
            }

    return HeuristicResult.PASS, {"package_name": package_name}

0 commit comments

Comments
 (0)