Skip to content

Commit 9cfb624

Browse files
committed
Extract new aboutcode.hashids package
This helps with usage in FederatedCode, PurlDB and VulnerableCode Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent b7a7237 commit 9cfb624

File tree

5 files changed

+556
-64
lines changed

5 files changed

+556
-64
lines changed

aboutcode/hashid/__init__.py

Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# Portions Copyright (c) The Python Software Foundation
4+
# VulnerableCode is a trademark of nexB Inc.
5+
# SPDX-License-Identifier: Apache-2.0 and Python-2.0
6+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
7+
# See https://github.com/nexB/vulnerablecode for support or download.
8+
# See https://aboutcode.org for more information about nexB OSS projects.
9+
#
10+
11+
from hashlib import sha256
12+
from math import ceil
13+
from pathlib import Path
14+
from typing import Union
15+
from urllib.parse import quote
16+
from uuid import uuid4
17+
18+
from packageurl import PackageURL
19+
from packageurl import normalize_qualifiers
20+
from packageurl import normalize_subpath
21+
22+
"""
23+
General purpose utilities to create Vulnerability Ids aka. VCID and content-defined, hash-based
24+
paths to store Vulnerability and Package data using these paths in many balanced directories.
25+
26+
The reason why this is needed is to store many vulnerability and package metadata files, we need
27+
to distribute these files in multiple directories and avoid too many files in the same directory
28+
which makes every filesystem performance suffer.
29+
30+
In addition, when storing these files in Git repositories, we need to avoid creating any repository
31+
with too many files that would make using this repository impractical or exceed the limits of some
32+
repository hosting services.
33+
34+
Therefore we are storing vulnerability data using a directory tree using the first few characters
35+
of the PURL hash of a package or the UUID of a vulnerability id.
36+
"""
37+
38+
# Name of the top-level directory (or repository) under which vulnerability YAML files are stored.
VULNERABILITY_REPO_NAME = "aboutcode-vulnerabilities"

# Prefix of a package directory (or repository) name: a dash and the purl hash are appended to it.
PACKAGE_REPOS_NAME_PREFIX = "aboutcode-packages"
# File names of the per-package YAML files stored in a package base directory.
PURLS_FILENAME = "purls.yml"
VULNERABILITIES_FILENAME = "vulnerabilities.yml"
43+
44+
45+
def build_vcid(prefix="VCID"):
    """
    Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
    identifier string using the provided ``prefix``. A VCID is composed of the prefix (four
    letters by default) and three segments composed of four letters and digits each, all
    separated by a dash.

    For example::
        >>> import re
        >>> vcid = build_vcid()
        >>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
    """
    # Hash a random UUID and keep 10 digest bytes (80 bits). base32_custom() encodes its input
    # in groups of 5 bytes and reads a shorter trailing group as a smaller integer, i.e. with
    # leading zero bits. Feeding it only 8 bytes made encoded characters 9 to 11 always "aaa"
    # and left only 4 random bits in the 12th one. Two full groups avoid this.
    digest = sha256(uuid4().bytes).digest()[:10]
    # we keep only the first 12 encoded characters: 12 x 5 bits = 60 random bits
    uid = base32_custom(digest)[:12].decode("utf-8").lower()
    return f"{prefix}-{uid[:4]}-{uid[4:8]}-{uid[8:12]}"
61+
62+
63+
def get_vcid_yml_file_path(vcid: str):
    """
    Return the Path of the vulnerability YAML file for the ``vcid`` VCID, located under the
    vulnerabilities repository directory.
    """
    repo_root = Path(VULNERABILITY_REPO_NAME)
    return repo_root.joinpath(vulnerability_yml_path(vcid))
68+
69+
70+
# This cuxstom 32 characters alphabet is designed to avoid visually easily confusable characters:
71+
# i and l
72+
# 0 and o
73+
_base32_alphabet = b"abcdefghjkmnpqrstuvwxyz123456789"
74+
_b32tab = [bytes((i,)) for i in _base32_alphabet]
75+
_base32_table = [a + b for a in _b32tab for b in _b32tab]
76+
77+
base32_custom_alphabet = _base32_alphabet.decode("utf-8")
78+
79+
80+
def base32_custom(btes):
81+
"""
82+
Encode the ``btes`` bytes using a custom Base32 encoding with a custom alphabet and return a
83+
lowercase byte string. This alphabet is designed to avoid confusable characters.
84+
85+
Not meant for general purpose Base32 encoding as this is not designed to ever be decoded.
86+
Code copied and modified from the Python Standard Library: base64._b32encode function
87+
88+
For example::
89+
>>> base32_custom(b'abcd')
90+
b'abtze25e'
91+
92+
>>> base32_custom(b'abcde00000xxxxxPPPPP')
93+
b'pfugg3dfga2dapbtsb6ht8d2mbjfaxct'
94+
"""
95+
96+
encoded = bytearray()
97+
from_bytes = int.from_bytes
98+
99+
for i in range(0, len(btes), 5):
100+
c = from_bytes(btes[i : i + 5], "big") # big-endian
101+
encoded += (
102+
_base32_table[c >> 30] # bits 1 - 10
103+
+ _base32_table[(c >> 20) & 0x3FF] # bits 11 - 20
104+
+ _base32_table[(c >> 10) & 0x3FF] # bits 21 - 30
105+
+ _base32_table[c & 0x3FF] # bits 31 - 40
106+
)
107+
return bytes(encoded)
108+
109+
110+
def vulnerability_yml_path(vcid):
    """
    Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.

    The approach is to distribute the files in many directories to avoid having too many files in
    any directory and be able to find the path to a vulnerability file given its VCID. Files are
    distributed on the first two characters of the UUID section of a VCID, which is the section
    that follows the first dash.

    The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
    possibilities, meaning 1024 directories. Given a count of vulnerabilities of about 300K in
    mid 2024, this gives a distribution of about 300 vulnerabilities in each of 1024 directories
    and plenty of room to grow.

    The serialized vulnerability data is estimated to be about 300MB compressed and should be
    storable in a single Git repository.

    For example::
        >>> vulnerability_yml_path("VCID-s9bw-m429-aaaf")
        's9/VCID-s9bw-m429-aaaf.yml'
    """
    _prefix, dash, uid = vcid.partition("-")
    # Locate the UUID section after the first dash rather than at a fixed offset, so that ids
    # built with a prefix that is not four characters long are distributed correctly. Without
    # any dash, fall back to the fixed offset of a standard four-letter prefix.
    shard = uid[:2] if dash else vcid[5:7]
    return f"{shard}/{vcid}.yml"
132+
133+
134+
def get_package_base_dir(purl: Union[PackageURL, str]):
    """
    Return the base Path of the directory of a Package for a ``purl``, ignoring its version.
    The first path segment is the package repository name prefix followed by a dash and the
    purl hash.
    """
    purl_hash, package_dir, _version, _extra = package_path_elements(purl)
    repo_root = Path(f"{PACKAGE_REPOS_NAME_PREFIX}-{purl_hash}")
    return repo_root / package_dir
141+
142+
143+
def get_package_purls_yml_file_path(purl: Union[PackageURL, str]):
    """
    Return the Path of the purls.yml YAML file stored in the package base directory of a
    ``purl``.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir / PURLS_FILENAME
148+
149+
150+
def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
    """
    Return the Path of the vulnerabilities.yml YAML file stored in the package base directory
    of a ``purl``.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir / VULNERABILITIES_FILENAME
155+
156+
157+
def package_path_elements(purl: Union[PackageURL, str]):
    """
    Return a 4-tuple of POSIX path segments crafted from the ``purl`` package PURL string or
    object. The tuple members are: (purl_hash, core_path, purl.version, extra_path)
    These members can be joined using a POSIX "/" path separator to store package data
    distributed evenly in many directories, where the data of the same package is co-located in
    the same root directory.

    The approach is to distribute the files in many directories to avoid having too many data
    files in any directory and be able to find the path to the YAML data files for a package given
    its PURL. For this we use the "purl hash" to construct a path.

    A purl hash has 8,192 possible values, meaning 8,192 directories or repositories, basically
    used as a hash table. Given an estimated count of packages of about 30 million in mid 2024,
    this gives a distribution of about 4,000 packages in each of these top level directories and
    some room to grow.

    The size to store compressed package metadata is guesstimated to be 1MB on average and 10MB
    for a full scan. At the average size, each directory would store 4K * 1MB ~= 4 GB. This should
    keep backing git repositories to a reasonable size.

    The storage scheme is designed to create this path structure:

    <short-purl-hash> : top level directory or repository
      <type>/<namespace>/<name> : sub directories
        purls.yml : YAML file with known versions for this package ordered from oldest to newest
        vulnerabilities.yml : YAML file with known vulnerabilities affecting (and fixed by) this package

        <version> : one sub directory for each version
          metadata.yml : ABOUT YAML file with package origin and license metadata for this version
          scancode-scan.yml : a scancode scan for this package version
          foo-scan.yml : a scan for this package version created with tool foo
          sbom.cdx.1.4.json : a CycloneDX SBOM
          sbom.cdx.1.5.json : a CycloneDX SBOM
          sbom.spdx.2.2.json : a SPDX SBOM
          .... other files

          <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath> if any
            metadata.yml : ABOUT YAML file with package origin and license metadata for this version
            scancode-scan.yml : a scancode scan for this package version
            foo-scan.yml : a scan for this package version created with tool foo
            sbom.cdx.1.4.json : a CycloneDX SBOM
            ... other files

    Some examples:

    We keep the same prefix for different versions::

        >>> package_path_elements("pkg:pypi/license_expression@30.3.1")
        ('1050', 'pypi/license-expression', '30.3.1', '')
        >>> package_path_elements("pkg:pypi/license_expression@10.3.1")
        ('1050', 'pypi/license-expression', '10.3.1', '')

    We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
    for filesystems::

        >>> package_path_elements("pkg:pypi/license_expression@30.3.1?foo=bar&baz=bar#sub/path")
        ('1050', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')

        >>> purl = PackageURL(
        ...     type="pypi",
        ...     name="license_expression",
        ...     version="b#ar/?30.3.2!",
        ...     qualifiers=dict(foo="bar"),
        ...     subpath="a/b/c")
        >>> package_path_elements(purl)
        ('1050', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
    """
    if isinstance(purl, str):
        purl = PackageURL.from_string(purl)

    purl_hash = get_purl_hash(purl)

    # <namespace>/<name>, or only <name> when there is no namespace
    namespace = purl.namespace
    name_path = f"{namespace}/{purl.name}" if namespace else purl.name

    # the extra path is the quoted qualifiers followed by the quoted "#subpath", when present
    extra_path = ""
    qualifiers = purl.qualifiers
    if qualifiers:
        # note that we percent-quote everything including the / character
        extra_path = quote_more(normalize_qualifiers(qualifiers, encode=True))
    subpath = purl.subpath
    if subpath:
        encoded_subpath = normalize_subpath(subpath, encode=True)
        extra_path += quote_more(f"#{encoded_subpath}")

    return purl_hash, f"{purl.type}/{name_path}", quote_more(purl.version), extra_path
246+
247+
248+
def quote_more(qs):
    """
    Return the ``qs`` string percent-quoted for safer use as a path segment. Every character
    that is not an unreserved URL character is quoted, including "/". The "%" character is left
    as-is, such that already quoted sequences are not quoted a second time.

    An empty or None ``qs`` is returned unchanged.

    For example::
        >>> quote_more("foo")
        'foo'

        >>> quote_more("foo/bar")
        'foo%2Fbar'

        >>> quote_more("foo%2Fbar")
        'foo%2Fbar'
    """
    if qs:
        try:
            quoted = quote(qs, safe="%")
        except Exception as e:
            raise Exception(f"Failed to quote_more: {qs!r}") from e
        return quoted
    return qs
269+
270+
271+
def get_core_purl(purl: Union[PackageURL, str]):
    """
    Return a new "core" PackageURL built from a ``purl`` string or object, keeping every field
    except for the version, qualifiers and subpath.
    """
    if isinstance(purl, str):
        purl = PackageURL.from_string(purl)

    core_fields = purl.to_dict()
    for dropped in ("version", "qualifiers", "subpath"):
        del core_fields[dropped]
    return PackageURL(**core_fields)
283+
284+
285+
def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
    """
    Return a short lower cased hash string from a ``purl`` string or object. The PURL is
    normalized and we drop its version, qualifiers and subpath before hashing.

    The ``_bit_count`` argument defaults to 13 bits which represents 2**13 = 8192 possible hash
    values. The returned hash string has a fixed length and is left-padded with zeros.

    The hash length is the number of hex characters needed to store ``_bit_count`` bits, at 4
    bits per hex character. For 13 bits, this means 4 characters.

    The function is carefully designed to be portable across tech stacks and easy to implement in
    many programming languages:

    - the hash is computed using sha256 which is available in all common languages,
    - the hash is using simple lowercased HEX encoding,
    - we use simple arithmetic on integers with modulo.

    The processing goes through these steps:

    First, a SHA256 hash is computed on the core PURL string encoded as UTF-8 bytes.

    Then, the hash digest bytes are converted to a big-endian integer, which is reduced modulo
    2**_bit_count.

    Finally, this number is converted to hex, left-padded with zeros up to the hash length, and
    returned as a lowercase string.

    For example::

    The hash does not change with version, qualifiers or subpath::
        >>> get_purl_hash("pkg:pypi/license-expression@30.3.1")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression@10.3.1")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression@30.3.1?foo=bar#sub/path")
        '1050'

    The hash is left padded with zeros if it is shorter than the hash length::
        >>> get_purl_hash("pkg:pypi/expressionss")
        '0057'

    We normalize the PURL. Here pypi normalization always uses dash for underscore ::

        >>> get_purl_hash("pkg:pypi/license_expression")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression")
        '1050'

    Originally from:
    https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
    """
    # hash the UTF-8 bytes of the normalized core purl string
    normalized = get_core_purl(purl).to_string()
    digest = sha256(normalized.encode("utf-8")).digest()
    # read the whole digest as a big-endian integer and truncate it with a modulo on 2**_bit_count
    truncated = int.from_bytes(digest, "big") % (2**_bit_count)
    # a hex character stores 4 bits: this gives the zero-padded width of the hash string
    width = ceil(_bit_count / 4)
    return format(truncated, f"0{width}x").lower()

aboutcode/hashid/__init__.py.ABOUT

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
about_resource: __init__.py
2+
notes: the base32_custom() function is derived from Python base64.py _b32encode function
3+
download_url: https://github.com/python/cpython/blob/77133f570dcad599e5b1199c39e999bfac959ae2/Lib/base64.py#L164
4+
purl: pkg:github/python/cpython@77133f570dcad599e5b1199c39e999bfac959ae2#Lib/base64.py
5+
license_expression_spdx: Python-2.0
6+
license_expression: python
7+
copyright: Copyright (c) The Python Software Foundation

0 commit comments

Comments
 (0)