2828which makes every filesystem performance suffer.
2929
3030In addition, when storing these files in Git repositories, we need to avoid creating any repository
31- with too many files that would make using this repository impactical or exceed the limits of some
31+ with too many files that would make using this repository impractical or exceed the limits of some
3232repository hosting services.
3333
3434Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
4646 """
4747 Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
4848 identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
49- three segments composed of four letters and dihits each separated by a dash.
49+ three segments composed of four letters and digits each separated by a dash.
5050 For example::
5151 >>> import re
5252 >>> vcid = build_vcid()
5353 >>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
5454
5555 We were mistakenly not using enough bits. The symptom was that the last
56- segment of the VCID was always strting with "aaa" This ensure we are now OK:
56+ segment of the VCID was always starting with "aaa". This ensures we are now OK:
5757 >>> vcids = [build_vcid() for _ in range(50)]
5858 >>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
5959 """
6060 uid = uuid4 ().bytes
61- # we keep three segments of 4 base32-encodee bytes, 3*4=12
61+ # we keep three segments of 4 base32-encoded bytes, 3*4=12
6262 # which corresponds to 60 bits
63- # becausee each base32 byte can store 5 bits (2**5 = 32)
63+ # because each base32 byte can store 5 bits (2**5 = 32)
6464 uid = base32_custom (uid )[:12 ].decode ("utf-8" ).lower ()
6565 return f"{ prefix } -{ uid [:4 ]} -{ uid [4 :8 ]} -{ uid [8 :12 ]} "
6666
@@ -72,7 +72,7 @@ def get_vcid_yml_file_path(vcid: str):
7272 return Path (VULNERABILITY_REPO_NAME ) / vulnerability_yml_path (vcid )
7373
7474
75- # This cuxstom 32 characters alphabet is designed to avoid visually easily confusable characters:
75+ # This custom 32 characters alphabet is designed to avoid visually easily confusable characters:
7676# i and l
7777# 0 and o
7878_base32_alphabet = b"abcdefghjkmnpqrstuvwxyz123456789"
@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
117117 Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
118118
119119 The approach is to distribute the files in many directories to avoid having too many files in
120- any directory and be able to find the path to a vulneravility file given its VCID distributed on
120+ any directory and be able to find the path to a vulnerability file given its VCID distributed on
121121 the first two characters of the UUID section of a VCID.
122122
123123 The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -140,9 +140,12 @@ def get_package_base_dir(purl: Union[PackageURL, str]):
140140 """
141141 Return the base path to a Package directory (ignoring version) for a purl
142142 """
143+ if isinstance (purl , str ):
144+ purl = PackageURL .from_string (purl )
145+
143146 path_elements = package_path_elements (purl )
144147 phash , core_path , _pversion , _extra_path = path_elements
145- return Path (f"{ PACKAGE_REPOS_NAME_PREFIX } -{ phash } " ) / core_path
148+ return Path (f"{ PACKAGE_REPOS_NAME_PREFIX } -{ purl . type } - { phash } " ) / core_path
146149
147150
148151def get_package_purls_yml_file_path (purl : Union [PackageURL , str ]):
@@ -159,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
159162 return get_package_base_dir (purl ) / VULNERABILITIES_FILENAME
160163
161164
165+ # We use a 4-tier system for storing package metadata.
166+ # The tiers are as follows:
167+ # 1. Super Large Ecosystem (~5M packages): 2^10 = 1,024 git repositories
168+ # 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
169+ # 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
170+ # 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
171+ # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
172+ BIT_COUNT_BY_ECOSYSTEM = {
173+ # Super Large Ecosystem
174+ "github" : 10 ,
175+ "npm" : 10 ,
176+ # Large Ecosystem
177+ "golang" : 7 ,
178+ "maven" : 7 ,
179+ "nuget" : 7 ,
180+ "perl" : 7 ,
181+ "php" : 7 ,
182+ "pypi" : 7 ,
183+ "ruby" : 7 ,
184+ # Medium Ecosystem
185+ "alpm" : 5 ,
186+ "bitbucket" : 5 ,
187+ "cocoapods" : 5 ,
188+ "composer" : 5 ,
189+ "deb" : 5 ,
190+ "docker" : 5 ,
191+ "gem" : 5 ,
192+ "generic" : 5 ,
193+ "huggingface" : 5 ,
194+ "mlflow" : 5 ,
195+ "pub" : 5 ,
196+ "rpm" : 5 ,
197+ # Small Ecosystem
198+ "bitnami" : 0 ,
199+ "cargo" : 0 ,
200+ "conan" : 0 ,
201+ "conda" : 0 ,
202+ "cpan" : 0 ,
203+ "cran" : 0 ,
204+ "hackage" : 0 ,
205+ "hex" : 0 ,
206+ "luarocks" : 0 ,
207+ "swift" : 0 ,
208+ }
209+
210+
162211def package_path_elements (purl : Union [PackageURL , str ]):
163212 """
164213 Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -196,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
196245 sbom.spdx.2.2.json : a SPDX SBOM
197246 .... other files
198247
199- <extra_path> : one sub directory for each quote-encoded <qualifiers#supath > if any
248+ <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath > if any
200249 metadata.yml : ABOUT YAML file with package origin and license metadata for this version
201250 scancode-scan.yml : a scancode scan for this package version
202251 foo-scan.yml : a scan for this package version created with tool foo
@@ -208,15 +257,15 @@ def package_path_elements(purl: Union[PackageURL, str]):
208257 We keep the same prefix for different versions::
209258
210259 >>> package_path_elements("pkg:pypi/license-expression@30.3.1") 211- ('1050 ', 'pypi/license-expression', '30.3.1', '')
260+ ('50 ', 'pypi/license-expression', '30.3.1', '')
212261 >>> package_path_elements("pkg:pypi/license-expression@10.3.1") 213- ('1050 ', 'pypi/license-expression', '10.3.1', '')
262+ ('50 ', 'pypi/license-expression', '10.3.1', '')
214263
215264 We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
216265 for filesystems::
217266
218267 >>> package_path_elements("pkg:pypi/license-expression@30.3.1?foo=bar&baz=bar#sub/path") 219- ('1050 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
268+ ('50 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
220269
221270 >>> purl = PackageURL(
222271 ... type="pypi",
@@ -225,12 +274,13 @@ def package_path_elements(purl: Union[PackageURL, str]):
225274 ... qualifiers=dict(foo="bar"),
226275 ... subpath="a/b/c")
227276 >>> package_path_elements(purl)
228- ('1050 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
277+ ('50 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
229278 """
230279 if isinstance (purl , str ):
231280 purl = PackageURL .from_string (purl )
232281
233- purl_hash = get_purl_hash (purl )
282+ bit_count = BIT_COUNT_BY_ECOSYSTEM .get (purl .type , 0 )
283+ purl_hash = get_purl_hash (purl = purl , _bit_count = bit_count )
234284
235285 if ns := purl .namespace :
236286 ns_name = f"{ ns } /{ purl .name } "
@@ -287,17 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
287337 return PackageURL (** purld )
288338
289339
290- def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 13 ) -> str :
340+ def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 0 ) -> str :
291341 """
292342 Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
293343 and we drop its version, qualifiers and subpath.
294344
295- This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
296- which represents 2**13 = 8192 possible hash values . It returns a fixed length short hash string
345+ This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
346+ which represents 2**0 = 1 possible hash value . It returns a fixed length short hash string
297347 that is left-padded with zeros.
298348
299349 The hash length is derived from the bit_count and the number of bits-per-byte stored in a hex
300- encoding of this bits count. For 13 bits, this means up to 4 characters.
350+ encoding of this bits count. For 10 bits, this means up to 3 characters.
301351
302352 The function is carefully designed to be portable across tech stacks and easy to implement in
303353 many programming languages:
@@ -319,23 +369,23 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
319369 For example::
320370
321371 The hash does not change with version or qualifiers::
322- >>> get_purl_hash("pkg:pypi/[email protected] ") 323- '1289 '
324- >>> get_purl_hash("pkg:pypi/[email protected] ") 325- '1289 '
326- >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path") 327- '1289 '
372+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 ) 373+ '09 '
374+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 ) 375+ '09 '
376+ >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path", 7 ) 377+ '09 '
328378
329379 The hash is left padded with zeros if it is shorter than the fixed hash length::
330- >>> get_purl_hash("pkg:pypi/expressionss")
331- '0057 '
380+ >>> get_purl_hash("pkg:pypi/expressionss", 7 )
381+ '57 '
332382
333383 We normalize the PURL. Here pypi normalization always uses dash for underscore ::
334384
335- >>> get_purl_hash("pkg:pypi/license_expression")
336- '1050 '
337- >>> get_purl_hash("pkg:pypi/license-expression")
338- '1050 '
385+ >>> get_purl_hash("pkg:pypi/license_expression", 7 )
386+ '50 '
387+ >>> get_purl_hash("pkg:pypi/license-expression", 7 )
388+ '50 '
339389
340390 Originally from:
341391 https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
0 commit comments