99
1010import os
1111import logging
12+ from collections import Counter
1213from pathlib import Path
1314
1415from commoncode import fileutils
@@ -137,6 +138,7 @@ def parse(cls, location):
137138 debian_data = get_paragraph_data_from_file (location = location ),
138139 datasource_id = cls .datasource_id ,
139140 package_type = cls .default_package_type ,
141+ distro = 'debian' ,
140142 )
141143
142144 @classmethod
@@ -157,15 +159,19 @@ class DebianControlFileInSourceHandler(models.DatafileHandler):
157159
@classmethod
def parse(cls, location):
    """
    Yield one package data object per paragraph of the Debian control
    file at ``location``.

    All paragraphs are converted first and then handed as a group to
    populate_debian_namespace(), which yields them back.
    """
    # NOTE: a control file in a source repo or debian.tar tarball can
    # contain more than one package
    packages = [
        build_package_data(
            debian_data=paragraph,
            datasource_id=cls.datasource_id,
            package_type=cls.default_package_type,
        )
        for paragraph in get_paragraphs_data_from_file(location=location)
    ]

    yield from populate_debian_namespace(packages)

174+
169175 @classmethod
170176 def assign_package_to_resources (cls , package , resource , codebase , package_adder ):
171177 # two levels up
@@ -191,11 +197,19 @@ def parse(cls, location):
191197 location = location ,
192198 remove_pgp_signature = True ,
193199 )
194- yield build_package_data (
200+
201+ package_data_from_file = build_package_data_from_package_filename (
202+ filename = os .path .basename (location ),
203+ datasource_id = cls .datasource_id ,
204+ package_type = cls .default_package_type ,
205+ )
206+ package_data = build_package_data (
195207 debian_data = debian_data ,
196208 datasource_id = cls .datasource_id ,
197209 package_type = cls .default_package_type ,
198210 )
211+ package_data .update_purl_fields (package_data = package_data_from_file )
212+ yield package_data
199213
200214 @classmethod
201215 def assign_package_to_resources (cls , package , resource , codebase , package_adder ):
@@ -214,13 +228,18 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
def parse(cls, location):
    """
    Yield one package data object per paragraph of the installed status
    database file at ``location``.

    All paragraphs are converted first and then handed as a group to
    populate_debian_namespace(), which yields them back.
    """
    # note that we do not know yet the distro at this stage
    # we could get it... but we get that later during assemble()
    packages = [
        build_package_data(
            debian_data=paragraph,
            datasource_id=cls.datasource_id,
            package_type=cls.default_package_type,
        )
        for paragraph in get_paragraphs_data_from_file(location=location)
    ]

    yield from populate_debian_namespace(packages)

242+
224243 @classmethod
225244 def assemble (cls , package_data , resource , codebase , package_adder ):
226245 # get the root resource of the rootfs
@@ -260,7 +279,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
260279
261280 # We only need to adjust the md5sum/list path in the case of `same`
262281 qualifiers = package_data .qualifiers or {}
263- architecture = qualifiers .get ('architecture ' )
282+ architecture = qualifiers .get ('arch ' )
264283
265284 multi_arch = package_data .extra_data .get ('multi_arch' )
266285
@@ -305,6 +324,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
305324 package .update (
306325 package_data = package_data ,
307326 datafile_path = res .path ,
327+ check_compatible = False ,
308328 replace = False ,
309329 include_version = False ,
310330 include_qualifiers = False ,
@@ -379,14 +399,18 @@ def parse(cls, location):
379399 rootfs installation. distroless is derived from Debian but each package
380400 has its own status file.
381401 """
382- for debian_data in get_paragraphs_data_from_file (location ):
383- yield build_package_data (
384- debian_data ,
385- datasource_id = cls .datasource_id ,
386- package_type = cls .default_package_type ,
387- distro = 'distroless' ,
402+ debian_packages = []
403+ for debian_data in get_paragraphs_data_from_file (location = location ):
404+ debian_packages .append (
405+ build_package_data (
406+ debian_data = debian_data ,
407+ datasource_id = cls .datasource_id ,
408+ package_type = cls .default_package_type ,
409+ )
388410 )
389411
412+ yield from populate_debian_namespace (debian_packages )
413+
390414 @classmethod
391415 def assemble (cls , package_data , resource , codebase , package_adder ):
392416 # get the root resource of the rootfs
@@ -523,6 +547,9 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
523547 """
524548
525549 # TODO: we cannot know the distro from the name only
550+ # PURLs without namespace is invalid, so we need to
551+ # have a default value for this
552+ distro = 'debian'
526553 deb = DebArchive .from_filename (filename = filename )
527554
528555 if deb .architecture :
@@ -538,6 +565,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
538565 datasource_id = datasource_id ,
539566 type = package_type ,
540567 name = deb .name ,
568+ namespace = distro ,
541569 version = version ,
542570 qualifiers = qualifiers ,
543571 )
@@ -598,7 +626,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
598626 qualifiers = {}
599627 architecture = debian_data .get ('architecture' )
600628 if architecture :
601- qualifiers ['architecture ' ] = architecture
629+ qualifiers ['arch ' ] = architecture
602630
603631 extra_data = {}
604632 # Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or
@@ -628,13 +656,27 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
628656 if keyword :
629657 keywords .append (keyword )
630658
659+ # Get distro/namespace information from clues in package data
660+ if not distro :
661+ if version :
662+ for clue , namespace in version_clues_for_namespace .items ():
663+ if clue in version :
664+ distro = namespace
665+ break
666+
667+ if maintainer :
668+ for clue , namespace in maintainer_clues_for_namespace .items ():
669+ if clue in maintainer :
670+ distro = namespace
671+ break
672+
631673 source_packages = []
632674 source = debian_data .get ('source' )
633675 if source :
634676 source_pkg_purl = PackageURL (
635677 type = package_type ,
636678 name = source ,
637- namespace = distro
679+ namespace = distro ,
638680 ).to_string ()
639681
640682 source_packages .append (source_pkg_purl )
@@ -656,6 +698,46 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
656698 )
657699
658700
def populate_debian_namespace(packages):
    """
    For an iterable of debian `packages`, yield every package in its
    original order after filling in the namespace of the packages that
    have none.

    The namespace used as a filler is the most frequently occurring
    namespace among the packages that do have one (the first one seen
    wins a tie), or the default namespace 'debian' when no package has
    a namespace at all. Packages that already have a namespace are
    yielded unchanged.

    `packages` can be any iterable, including a generator, or None.
    Nothing is yielded for an empty or None input.
    """
    # Materialize once: the packages are traversed twice below, and a
    # generator would be exhausted (and always truthy) otherwise.
    packages = list(packages or [])
    if not packages:
        return

    # Count only the namespaces that were actually detected, so that
    # packages without a namespace cannot outvote the known ones.
    known_namespaces = Counter(
        package.namespace
        for package in packages
        if package.namespace
    )
    if known_namespaces:
        distro = max(known_namespaces, key=known_namespaces.get)
    else:
        distro = 'debian'

    for package in packages:
        if not package.namespace:
            package.namespace = distro
        yield package
722+
723+
# Mapping of a substring to look for in a package version to the
# namespace (distro) that this substring hints at.
version_clues_for_namespace = dict(
    deb='debian',
    ubuntu='ubuntu',
)
728+
729+
# Mapping of a substring to look for in a package maintainer field to
# the namespace (distro) that this substring hints at. The clues are
# grouped by namespace; the resulting key order is debian clues first,
# then ubuntu clues.
maintainer_clues_for_namespace = {
    **dict.fromkeys(
        (
            'packages.debian.org',
            'lists.debian.org',
            'lists.alioth.debian.org',
            '@debian.org',
            'debian-init-diversity@',
        ),
        'debian',
    ),
    **dict.fromkeys(
        (
            'lists.ubuntu.com',
            '@canonical.com',
        ),
        'ubuntu',
    ),
}
739+
740+
659741ignored_root_dirs = {
660742 '/.' ,
661743 '/bin' ,
0 commit comments