Skip to content

Commit 067eb6d

Browse files
authored
chore: fix a bug in SimilarProjectAnalyzer (#1152)
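This PR fixes a bug in the SimilarProjectAnalyzer: errors raised while reading a package's source distribution tar file (tarfile.TarError, OSError) were not handled. They are now caught and logged at debug level, and an empty file list is returned. Additionally, it improves efficiency by ensuring that the SIMILAR_PROJECTS heuristic only runs when most other heuristics have failed. Signed-off-by: behnazh-w <[email protected]>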
1 parent 9b05632 commit 067eb6d

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,18 @@ def __init__(self) -> None:
2424
super().__init__(
2525
name="similar_project_analyzer",
2626
heuristic=Heuristics.SIMILAR_PROJECTS,
27-
depends_on=None,
27+
depends_on=[
28+
(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL),
29+
(Heuristics.ONE_RELEASE, HeuristicResult.FAIL),
30+
(Heuristics.HIGH_RELEASE_FREQUENCY, HeuristicResult.FAIL),
31+
(Heuristics.UNCHANGED_RELEASE, HeuristicResult.FAIL),
32+
(Heuristics.CLOSER_RELEASE_JOIN_DATE, HeuristicResult.FAIL),
33+
(Heuristics.SUSPICIOUS_SETUP, HeuristicResult.FAIL),
34+
(Heuristics.WHEEL_ABSENCE, HeuristicResult.FAIL),
35+
(Heuristics.ANOMALOUS_VERSION, HeuristicResult.FAIL),
36+
(Heuristics.TYPOSQUATTING_PRESENCE, HeuristicResult.FAIL),
37+
(Heuristics.FAKE_EMAIL, HeuristicResult.FAIL),
38+
],
2839
)
2940

3041
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
@@ -106,6 +117,9 @@ def get_structure(self, package_name: str) -> list[str]:
106117
list[str]:
107118
The list of files in the package's sdist.
108119
"""
120+
# TODO: We should not download the source distributions for every package.
121+
# This is very inefficient. We should find a different way to extract the package
122+
# structure, e.g., the inspector service?
109123
sdist_url = self.get_url(package_name)
110124
if not sdist_url:
111125
logger.debug("Package %s does not have a sdist.", package_name)
@@ -117,10 +131,16 @@ def get_structure(self, package_name: str) -> list[str]:
117131
return []
118132

119133
buffer = io.BytesIO(response.content)
120-
with tarfile.open(fileobj=buffer, mode="r:gz") as tf:
121-
members = [
122-
member.name for member in tf.getmembers() if member.name and not member.name.startswith("PAXHeaders/")
123-
]
134+
try:
135+
with tarfile.open(fileobj=buffer, mode="r:gz") as tf:
136+
members = [
137+
member.name
138+
for member in tf.getmembers()
139+
if member.name and not member.name.startswith("PAXHeaders/")
140+
]
141+
except (tarfile.TarError, OSError) as error:
142+
logger.debug("Error reading source code tar file: %s", error)
143+
return []
124144

125145
return members
126146

0 commit comments

Comments
 (0)