44"""This analyzer checks if the package has a similar structure to other packages maintained by the same user."""
55
66import hashlib
7- import io
87import logging
9- import tarfile
108
119from macaron .json_tools import JsonType
1210from macaron .malware_analyzer .pypi_heuristics .base_analyzer import BaseHeuristicAnalyzer
1311from macaron .malware_analyzer .pypi_heuristics .heuristics import HeuristicResult , Heuristics
14- from macaron .slsa_analyzer .package_registry .pypi_registry import PyPIPackageJsonAsset
15- from macaron .util import send_get_http , send_get_http_raw
12+ from macaron .slsa_analyzer .package_registry .pypi_registry import PyPIInspectorAsset , PyPIPackageJsonAsset
1613
1714logger : logging .Logger = logging .getLogger (__name__ )
1815
@@ -24,20 +21,7 @@ def __init__(self) -> None:
2421 super ().__init__ (
2522 name = "similar_project_analyzer" ,
2623 heuristic = Heuristics .SIMILAR_PROJECTS ,
27- # TODO: these dependencies are used as this heuristic currently downloads many package sourcecode
28- # tarballs. Refactoring this heuristic to run more efficiently means this should have depends_on=None.
29- depends_on = [
30- (Heuristics .EMPTY_PROJECT_LINK , HeuristicResult .FAIL ),
31- (Heuristics .ONE_RELEASE , HeuristicResult .FAIL ),
32- (Heuristics .HIGH_RELEASE_FREQUENCY , HeuristicResult .FAIL ),
33- (Heuristics .UNCHANGED_RELEASE , HeuristicResult .FAIL ),
34- (Heuristics .CLOSER_RELEASE_JOIN_DATE , HeuristicResult .FAIL ),
35- (Heuristics .SUSPICIOUS_SETUP , HeuristicResult .FAIL ),
36- (Heuristics .WHEEL_ABSENCE , HeuristicResult .FAIL ),
37- (Heuristics .ANOMALOUS_VERSION , HeuristicResult .FAIL ),
38- (Heuristics .TYPOSQUATTING_PRESENCE , HeuristicResult .FAIL ),
39- (Heuristics .FAKE_EMAIL , HeuristicResult .FAIL ),
40- ],
24+ depends_on = None ,
4125 )
4226
4327 def analyze (self , pypi_package_json : PyPIPackageJsonAsset ) -> tuple [HeuristicResult , dict [str , JsonType ]]:
@@ -58,112 +42,127 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
5842 HeuristicAnalyzerValueError
5943 if the analysis fails.
6044 """
61- package_name = pypi_package_json .component_name
62- target_hash = self .get_structure_hash (package_name )
63- if not target_hash :
45+ target_structure = self .get_normalized_structure (pypi_package_json )
46+ if not target_structure :
6447 return HeuristicResult .SKIP , {}
48+ target_hash = hashlib .sha256 ("\n " .join (target_structure ).encode ("utf-8" )).hexdigest ()
49+ detail_info : dict = {}
50+ similar_projects : list [str ] = []
51+ result : HeuristicResult = HeuristicResult .PASS
52+
53+ maintainers = pypi_package_json .pypi_registry .get_maintainers_of_package (pypi_package_json .component_name )
54+ if not maintainers :
55+ # NOTE: This would ideally raise an error, identifying malformed package information, but issues with
56+ # obtaining maintainer information from the HTML page means this will remains as a SKIP for now.
57+ return HeuristicResult .SKIP , {}
58+
59+ analyzed : set [str ] = {pypi_package_json .component_name }
6560
66- maintainers = pypi_package_json .pypi_registry .get_maintainers_of_package (package_name )
67- if maintainers :
68- for maintainer in maintainers :
69- maintainer_packages = pypi_package_json .pypi_registry .get_packages_by_username (maintainer )
70- if not maintainer_packages :
61+ for maintainer in maintainers :
62+ maintainer_packages = pypi_package_json .pypi_registry .get_packages_by_username (maintainer )
63+ if not maintainer_packages :
64+ continue
65+ for package in maintainer_packages :
66+ # skip if it is a package we have already analyzed
67+ if package in analyzed :
7168 continue
72- for package in maintainer_packages :
73- if package == package_name :
74- continue
69+ analyzed .add (package )
7570
76- hash_value = self .get_structure_hash (package )
77- if target_hash == hash_value :
78- return HeuristicResult .FAIL , {
79- "message" : f"The package { package_name } has a similar structure to { package } ." ,
80- "similar_package" : package ,
81- }
71+ adjacent_pypi_json = PyPIPackageJsonAsset (
72+ package , None , False , pypi_package_json .pypi_registry , {}, "" , PyPIInspectorAsset ("" , [], {})
73+ )
74+ if not adjacent_pypi_json .download ("" ):
75+ continue
76+ structure = self .get_normalized_structure (adjacent_pypi_json )
77+ if not structure :
78+ continue
8279
83- return HeuristicResult .PASS , {}
80+ hash_value = hashlib .sha256 ("\n " .join (structure ).encode ("utf-8" )).hexdigest ()
81+ if target_hash == hash_value :
82+ similar_projects .append (package )
8483
85- def get_url (self , package_name : str , package_type : str = "sdist" ) -> str | None :
86- """Get the URL of the package's sdist.
84+ detail_info ["similar_projects" ] = similar_projects
85+ if similar_projects :
86+ result = HeuristicResult .FAIL
8787
88- Parameters
89- ----------
90- package_name : str
91- The name of the package.
92- package_type: str
93- The package type to retrieve the URL of.
88+ return result , detail_info
9489
95- Returns
96- -------
97- str | None:
98- The URL of the package's sdist or None if not found.
99- """
100- json_url = f"https://pypi.org/pypi/{ package_name } /json"
101- data = send_get_http (json_url , headers = {})
102- if not data :
103- logger .debug ("Failed to fetch package data for %s." , package_name )
104- return None
105-
106- sdist = next ((url for url in data ["urls" ] if url ["packagetype" ] == package_type and url .get ("url" )), None )
107- return sdist ["url" ] if sdist else None
90+ def get_normalized_structure (self , pypi_package_json : PyPIPackageJsonAsset ) -> set [str ] | None :
91+ """Extract a normalized structure for a package.
10892
109- def get_structure ( self , package_name : str ) -> list [ str ]:
110- """Get the file structure of the package's sdist .
93+ The normalized structure is the file tree structure of all python file in the package, with the package's
94+ name removed, so it is comparable .
11195
11296 Parameters
11397 ----------
114- package_name : str
115- The name of the package .
98+ pypi_package_json: PyPIPackageJsonAsset
99+ The PyPI package JSON asset object .
116100
117101 Returns
118102 -------
119- list [str]:
120- The list of files in the package's sdist .
103+ set [str] | None :
104+ The normalized structure of file paths in a set, or None if a problem was encountered .
121105 """
122- # TODO: We should not download the source distributions for every package.
123- # This is very inefficient. We should find a different way to extract the package
124- # structure, e.g., the inspector service?
125- sdist_url = self .get_url (package_name )
126- if not sdist_url :
127- logger .debug ("Package %s does not have a sdist." , package_name )
128- return []
129-
130- response = send_get_http_raw (sdist_url )
131- if not response :
132- logger .debug ("Failed to download sdist for package %s." , package_name )
133- return []
134-
135- buffer = io .BytesIO (response .content )
136- try :
137- with tarfile .open (fileobj = buffer , mode = "r:gz" ) as tf :
138- members = [
139- member .name
140- for member in tf .getmembers ()
141- if member .name and not member .name .startswith ("PAXHeaders/" )
142- ]
143- except (tarfile .TarError , OSError ) as error :
144- logger .debug ("Error reading source code tar file: %s" , error )
145- return []
146-
147- return members
148-
149- def get_structure_hash (self , package_name : str ) -> str :
150- """Get the hash of the package's file structure.
106+ if not pypi_package_json .get_inspector_links ():
107+ return None
151108
152- Parameters
153- ----------
154- package_name : str
155- The name of the package.
109+ # for normalizing the structure
110+ version = pypi_package_json .component_version
111+ if version is None :
112+ version = pypi_package_json .get_latest_version ()
113+ if version is None :
114+ return None
156115
157- Returns
158- -------
159- str:
160- The hash of the package's file structure.
161- """
162- structure = self .get_structure (package_name )
163- if not structure :
164- return ""
116+ prefix = "./" + pypi_package_json .component_name + "-" + version
117+ normalized_structure = set ()
118+
119+ # try using the tarball first
120+ tarball_link = pypi_package_json .inspector_asset .package_sdist_link
121+ if tarball_link and pypi_package_json .inspector_asset .package_link_reachability [tarball_link ]:
122+ # all files are always prefixed with ./<package_name>-<version>/<...> in tarballs
123+ # non-metadaata files then have <package_name>/
124+ # prefix += "/" + pypi_package_json.component_name + "/"
125+ structure = PyPIInspectorAsset .get_structure (tarball_link )
126+ if structure :
127+ for file_path in structure :
128+ # we only consider python files. This avoids considering always package-specific files like PKG_INFO, licenses,
129+ # build metadata, etc.
130+ if file_path [- 3 :] != ".py" :
131+ continue
132+
133+ # remove the "/package_name" from the prefix as well, that way the structure between two packages with different
134+ # names will be the same
135+ normalized_structure .add (
136+ file_path .removeprefix (prefix ).removeprefix ("/" + pypi_package_json .component_name )
137+ )
138+
139+ # We can't compare against wheel structures if we keep setup.py in there
140+ normalized_structure .discard ("/setup.py" )
141+ return normalized_structure
142+
143+ wheel_links = pypi_package_json .inspector_asset .package_whl_links
144+ if len (wheel_links ) > 0 :
145+ # wheels have this extra field for package metadata
146+ prefix += ".dist-info/"
147+ # structure is generally going to be the same, platform-specific details may vary for pacakges
148+ # which have platform-specific wheels
149+ structure = PyPIInspectorAsset .get_structure (wheel_links [0 ])
150+ if structure :
151+ for file_path in structure :
152+ # the .dist-info stuff is usually metadata
153+ if file_path .startswith (prefix ) or file_path [- 3 :] != ".py" :
154+ continue
155+
156+ # remove the "./package_name" from the prefix as well, that way the structure between
157+ # two packages with different names will be the same
158+ normalized_structure .add (
159+ file_path .removeprefix (pypi_package_json .component_name + "/" ).removeprefix (
160+ "./" + pypi_package_json .component_name
161+ )
162+ )
165163
166- normalized = sorted ([ p . replace ( package_name , "<ROOT>" ) for p in structure ])
164+ return normalized_structure
167165
168- joined = "\n " .join (normalized ).encode ("utf-8" )
169- return hashlib .sha256 (joined ).hexdigest ()
166+ # doesn't have wheel or tarball links even made, so shouldn't get here if the first line of this
167+ # function worked.
168+ return None
0 commit comments