@@ -119,29 +119,74 @@ def index_package_files(package, scan_data, reindex=False):
119119 return scan_index_errors
120120
121121
122- def index_package (
123- scannable_uri , package , scan_data , summary_data , project_extra_data , reindex = False
124- ):
def update_package_relationships(package, existing_package):
    """
    Re-point everything related to `existing_package` at `package`.

    Each related index set and the resources of `existing_package` are updated
    so that their `package` field refers to `package`. Afterwards
    `existing_package` is flagged as a duplicate and `package` as not a
    duplicate, and both are saved.
    """
    # Related managers on `existing_package`, processed in this order.
    related_manager_names = (
        "approximatedirectorycontentindex_set",
        "approximatedirectorystructureindex_set",
        "approximateresourcecontentindex_set",
        "exactfileindex_set",
        "snippetindex_set",
        "stemmedsnippetindex_set",
        "resources",
    )
    for manager_name in related_manager_names:
        related_manager = getattr(existing_package, manager_name)
        related_manager.update(package=package)

    # Swap the duplicate flags: the old package is saved first, then the new one.
    existing_package.is_duplicate = True
    existing_package.save()

    package.is_duplicate = False
    package.save()
137+
138+
# Package types that come from a package repository.
_PACKAGE_REPO_TYPES = ("maven", "pypi", "npm", "crate")
# Package types that come from a git hosting service.
_GIT_REPO_TYPES = ("github", "gitlab", "bitbucket")


def _source_preference(package_type):
    """
    Return 2 for a package repository type, 1 for a git hosting type and 0 for
    any other type. A higher value is preferred when choosing between packages.
    """
    if package_type in _PACKAGE_REPO_TYPES:
        return 2
    if package_type in _GIT_REPO_TYPES:
        return 1
    return 0


def _is_better_candidate(package, existing_package):
    """
    Return True if `package` should take over the relations of
    `existing_package`.

    `package` wins when its type is preferred over the type of
    `existing_package`, or when both packages have a release date and
    `package` was released before `existing_package`.
    """
    # NOTE(review): the original git-host condition compared `package.type`
    # against itself and could never be true. A ranked comparison is used
    # instead of simply swapping in `existing_package.type`, so that a git-host
    # package and a package-repository package cannot keep replacing each
    # other -- confirm this is the intended precedence.
    if _source_preference(package.type) > _source_preference(existing_package.type):
        return True

    # Release dates are only compared when both are set.
    if existing_package.release_date and package.release_date:
        return existing_package.release_date > package.release_date

    return False


def check_for_duplicate_packages(package):
    """
    Given a `package`, check to see if another package with the same SHA1 has
    already been indexed. If so, check whether `package` is a better candidate
    than each existing package for being the ultimate source for that package,
    and move the existing package's relations over to `package` when it is.

    Return True if at least one other non-duplicate package with the same SHA1
    exists (whether or not relations were moved), otherwise return False.
    Return False when `package` has no SHA1.
    """
    # Without a SHA1 there is nothing to compare against. Checked before the
    # import below so the early exit does not depend on `packagedb`.
    if not package.sha1:
        return False

    from packagedb.models import Package

    # Check for dupes
    candidates = Package.objects.filter(sha1=package.sha1, is_duplicate=False)
    # A saved `package` matches its own SHA1; leave it out so that it is not
    # reported as a duplicate of itself.
    if package.pk is not None:
        candidates = candidates.exclude(pk=package.pk)

    # Evaluate once so the loop and the return value see the same rows, even
    # though the loop may flag some of them as duplicates.
    existing_packages = list(candidates)

    for existing_package in existing_packages:
        # TODO: This will probably have to be a task
        if _is_better_candidate(package, existing_package):
            update_package_relationships(
                package=package, existing_package=existing_package
            )

    return bool(existing_packages)
183+
184+
185+ def index_package (
186+ scannable_uri , package , scan_data , summary_data , project_extra_data , reindex = False
187+ ):
188+ if check_for_duplicate_packages (package ):
189+ return
145190
146191 scan_index_errors = []
147192 try :
0 commit comments