|
| 1 | +# |
| 2 | +# Copyright (c) nexB Inc. and others. All rights reserved. |
| 3 | +# Portions Copyright (c) The Python Software Foundation |
| 4 | +# VulnerableCode is a trademark of nexB Inc. |
| 5 | +# SPDX-License-Identifier: Apache-2.0 and Python-2.0 |
| 6 | +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. |
| 7 | +# See https://github.com/nexB/vulnerablecode for support or download. |
| 8 | +# See https://aboutcode.org for more information about nexB OSS projects. |
| 9 | +# |
| 10 | + |
| 11 | +from hashlib import sha256 |
| 12 | +from math import ceil |
| 13 | +from pathlib import Path |
| 14 | +from typing import Union |
| 15 | +from urllib.parse import quote |
| 16 | +from uuid import uuid4 |
| 17 | + |
| 18 | +from packageurl import PackageURL |
| 19 | +from packageurl import normalize_qualifiers |
| 20 | +from packageurl import normalize_subpath |
| 21 | + |
| 22 | +""" |
| 23 | +General purpose utilities to create Vulnerability Ids aka. VCID and content-defined, hash-based |
| 24 | +paths to store Vulnerability and Package data using these paths in many balanced directories. |
| 25 | +
|
| 26 | +The reason why this is needed is to store many vulnerability and package metadata files, we need |
| 27 | +to distribute these files in multiple directories and avoid too many files in the same directory |
| 28 | +which makes every filesystem performance suffer. |
| 29 | +
|
| 30 | +In addition, when storing these files in Git repositories, we need to avoid creating any repository |
| 31 | +with too many files that would make using this repository impractical or exceed the limits of some |
| 32 | +repository hosting services. |
| 33 | +
|
| 34 | +Therefore we are storing vulnerability data using a directory tree using the first few characters |
| 35 | +of the PURL hash of a package or the UUID of a vulnerability id. |
| 36 | +""" |
| 37 | + |
# Name of the top-level directory (or repository) under which vulnerability YAML files are stored.
VULNERABILITY_REPO_NAME = "aboutcode-vulnerabilities"

# Prefix of the name of a package top-level directory (or repository): the purl hash is appended
# to this prefix after a dash.
PACKAGE_REPOS_NAME_PREFIX = "aboutcode-packages"
# File names of the per-package YAML files stored directly in a package base directory.
PURLS_FILENAME = "purls.yml"
VULNERABILITIES_FILENAME = "vulnerabilities.yml"
| 43 | + |
| 44 | + |
def build_vcid(prefix="VCID"):
    """
    Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
    identifier string using the provided ``prefix``. A VCID is composed of the prefix (four
    letters by default) and three segments of four letters and digits each, all separated by a
    dash.

    For example::
        >>> import re
        >>> vcid = build_vcid()
        >>> assert re.match('VCID(-[a-hjkmnp-z1-9]{4}){3}$', vcid), vcid
    """
    # Hash a random UUID and keep 10 bytes (80 bits) of the digest. base32_custom() consumes its
    # input in groups of 5 bytes: keeping a multiple of 5 bytes avoids a trailing partial group,
    # which would be encoded with constant leading "a" characters and carry little randomness.
    digest = sha256(uuid4().bytes).digest()[:10]
    # 10 bytes encode to 16 characters: we keep only the first 12 (which corresponds to 60 bits)
    uid = base32_custom(digest)[:12].decode("utf-8").lower()
    return f"{prefix}-{uid[:4]}-{uid[4:8]}-{uid[8:12]}"
| 61 | + |
| 62 | + |
def get_vcid_yml_file_path(vcid: str):
    """
    Return the Path to the vulnerability YAML file of the ``vcid`` VCID, rooted in the
    vulnerabilities repository directory.
    """
    relative_path = vulnerability_yml_path(vcid)
    return Path(VULNERABILITY_REPO_NAME, relative_path)
| 68 | + |
| 69 | + |
# This custom 32 characters alphabet is designed to avoid visually easily confusable characters:
# i and l
# 0 and o
_base32_alphabet = b"abcdefghjkmnpqrstuvwxyz123456789"
# one single-byte string for each character of the alphabet
_b32tab = [bytes([code]) for code in _base32_alphabet]
# lookup table of the 32 x 32 = 1024 two-character combinations, indexed by a 10-bit value
_base32_table = [first + second for first in _b32tab for second in _b32tab]

base32_custom_alphabet = _base32_alphabet.decode("utf-8")


def base32_custom(btes):
    """
    Return a lowercase byte string encoding the ``btes`` bytes with a custom Base32 encoding. The
    custom alphabet is designed to avoid confusable characters.

    This is not a general purpose Base32 encoding as it is not designed to ever be decoded: there
    is no padding, and a trailing group shorter than five bytes is read as a smaller big-endian
    integer that still yields eight characters.
    Code copied and modified from the Python Standard Library: base64._b32encode function

    For example::
        >>> base32_custom(b'abcd')
        b'abtze25e'

        >>> base32_custom(b'abcde00000xxxxxPPPPP')
        b'pfugg3dfga2dapbtsb6ht8d2mbjfaxct'
    """
    pairs = []
    for start in range(0, len(btes), 5):
        # each group of up to 5 bytes is a big-endian integer of at most 40 bits
        group = int.from_bytes(btes[start : start + 5], "big")
        # emit the four 10-bit slices from the most to the least significant, two characters each
        pairs.extend(_base32_table[(group >> shift) & 0x3FF] for shift in (30, 20, 10, 0))
    return b"".join(pairs)
| 108 | + |
| 109 | + |
def vulnerability_yml_path(vcid):
    """
    Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.

    The approach is to distribute the files in many directories to avoid having too many files in
    any directory and be able to find the path to a vulnerability file given its VCID. Files are
    distributed on the first two characters of the UUID section of a VCID, which is the section
    that follows the prefix and its dash.

    The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
    possibilities, meaning 1024 directories. Given a current count of vulnerabilities of about 300K,
    mid 2024 this gives ample distribution of about 300 vulnerabilities in each of 1024 directories
    and plenty of room to grow.

    The serialized vulnerability data should be about 300MB compressed and should be storable in
    a single Git repository.

    For example::
        >>> vulnerability_yml_path("VCID-s9bw-m429-aaaf")
        's9/VCID-s9bw-m429-aaaf.yml'

    The prefix does not need to be four letters long::
        >>> vulnerability_yml_path("VULNID-s9bw-m429-aaaf")
        's9/VULNID-s9bw-m429-aaaf.yml'
    """
    # The UUID section starts right after the first dash, whatever the length of the prefix.
    _prefix, dash, uid = vcid.partition("-")
    if dash:
        directory = uid[:2]
    else:
        # No dash at all: keep using the fixed offset of a four letter prefix.
        directory = vcid[5:7]
    return f"{directory}/{vcid}.yml"
| 132 | + |
| 133 | + |
def get_package_base_dir(purl: Union[PackageURL, str]):
    """
    Return the Path to the base directory of a package for a ``purl``, ignoring its version:
    the hash-suffixed package repository name followed by the package core path.
    """
    purl_hash, core_path, _version, _extra_path = package_path_elements(purl)
    repo_name = f"{PACKAGE_REPOS_NAME_PREFIX}-{purl_hash}"
    return Path(repo_name, core_path)
| 141 | + |
| 142 | + |
def get_package_purls_yml_file_path(purl: Union[PackageURL, str]):
    """
    Return the Path to the purls.yml YAML file of the package of a ``purl``, located in the
    package base directory.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir.joinpath(PURLS_FILENAME)
| 148 | + |
| 149 | + |
def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
    """
    Return the Path to the vulnerabilities.yml YAML file of the package of a ``purl``, located in
    the package base directory.
    """
    base_dir = get_package_base_dir(purl)
    return base_dir.joinpath(VULNERABILITIES_FILENAME)
| 155 | + |
| 156 | + |
def package_path_elements(purl: Union[PackageURL, str]):
    """
    Return a 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
    The tuple members are: (purl_hash, core_path, purl.version, extra_path)
    These members can be joined using a POSIX "/" path separator to store package data distributed
    evenly in many directories, where package data of the same package is co-located in the same
    root directory.

    The version and the extra path are percent-quoted with quote_more(); the extra path is an
    empty string when the purl has neither qualifiers nor subpath. A missing (falsy) version is
    returned as-is and is not quoted.

    The approach is to distribute the files in many directories to avoid having too many data files
    in any directory and be able to find the path to the YAML data files for a package given its
    PURL. For this we use the first characters of the "purl hash" to construct a path.

    A purl hash has 8,192 possible values, meaning 8,192 directories or repositories, basically used
    as a hash table. Given an estimated count of packages of about 30 million in mid 2024, this
    gives ample distribution of about 4,000 packages in each of these top level directories and some
    room to grow.

    The size to store compressed package metadata is guesstimated to be 1MB on average (and 10MB
    for a full scan). With the 1MB average, each directory will store about 4K * 1MB ~= 4 GB. This
    should keep backing git repositories to a reasonable size, below 5GB.

    The storage scheme is designed to create this path structure::

        <short-purl-hash> : top level directory or repository
          <type>/<namespace>/<name> : sub directories
            purls.yml : YAML file with known versions for this package ordered from oldest to newest
            vulnerabilities.yml : YAML file with known vulnerabilities affecting (and fixed by) this package

            <version> : one sub directory for each version
              metadata.yml : ABOUT YAML file with package origin and license metadata for this version
              scancode-scan.yml : a scancode scan for this package version
              foo-scan.yml : a scan for this package version created with tool foo
              sbom.cdx.1.4.json : a CycloneDX SBOM
              sbom.cdx.1.5.json : a CycloneDX SBOM
              sbom.spdx.2.2.json : a SPDX SBOM
              .... other files

              <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath> if any
                metadata.yml : ABOUT YAML file with package origin and license metadata for this version
                scancode-scan.yml : a scancode scan for this package version
                foo-scan.yml : a scan for this package version created with tool foo
                sbom.cdx.1.4.json : a CycloneDX SBOM
                ... other files

    Some examples:

    We keep the same prefix for different versions::

        >>> package_path_elements("pkg:pypi/license_expression@30.3.1")
        ('1050', 'pypi/license-expression', '30.3.1', '')
        >>> package_path_elements("pkg:pypi/license_expression@10.3.1")
        ('1050', 'pypi/license-expression', '10.3.1', '')

    We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
    for filesystems::

        >>> package_path_elements("pkg:pypi/license_expression@30.3.1?foo=bar&baz=bar#sub/path")
        ('1050', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')

        >>> purl = PackageURL(
        ...     type="pypi",
        ...     name="license_expression",
        ...     version="b#ar/?30.3.2!",
        ...     qualifiers=dict(foo="bar"),
        ...     subpath="a/b/c")
        >>> package_path_elements(purl)
        ('1050', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
    """
    package = PackageURL.from_string(purl) if isinstance(purl, str) else purl

    purl_hash = get_purl_hash(package)

    # <type>/<namespace>/<name>, or <type>/<name> when there is no namespace
    namespace = package.namespace
    if namespace:
        core_path = f"{package.type}/{namespace}/{package.name}"
    else:
        core_path = f"{package.type}/{package.name}"

    # qualifiers and subpath are folded in a single path segment
    # note that we percent-quote everything including the / character
    extra_path = ""
    qualifiers = package.qualifiers
    if qualifiers:
        extra_path = quote_more(normalize_qualifiers(qualifiers, encode=True))
    subpath = package.subpath
    if subpath:
        encoded_subpath = normalize_subpath(subpath, encode=True)
        extra_path += quote_more(f"#{encoded_subpath}")

    quoted_version = quote_more(package.version)
    return purl_hash, core_path, quoted_version, extra_path
| 246 | + |
| 247 | + |
def quote_more(qs):
    """
    Return the ``qs`` string percent-quoted, including the "/" character. The "%" character is
    left alone such that parts that are already quoted are not quoted twice. This makes the quoted
    string safer to use in a path. An empty or None ``qs`` is returned unchanged.

    For example::
        >>> quote_more("foo")
        'foo'

        >>> quote_more("foo/bar")
        'foo%2Fbar'

        >>> quote_more("foo%2Fbar")
        'foo%2Fbar'
    """
    if qs:
        try:
            # "%" is the only safe character: this keeps existing percent-escapes as they are
            qs = quote(qs, safe="%")
        except Exception as e:
            raise Exception(f"Failed to quote_more: {qs!r}") from e
    return qs
| 269 | + |
| 270 | + |
def get_core_purl(purl: Union[PackageURL, str]):
    """
    Return a new "core" PackageURL built from a ``purl`` string or object, without its version,
    qualifiers and subpath.
    """
    package = PackageURL.from_string(purl) if isinstance(purl, str) else purl

    core_fields = package.to_dict()
    # drop everything that is not part of the core purl
    for dropped in ("version", "qualifiers", "subpath"):
        core_fields.pop(dropped)
    return PackageURL(**core_fields)
| 283 | + |
| 284 | + |
def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
    """
    Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
    and we drop its version, qualifiers and subpath.

    The ``_bit_count`` argument defaults to 13 bits, which represents 2**13 = 8192 possible hash
    values. The returned short hash string has a fixed length and is left-padded with zeros.

    The hash length is derived from the bit count and the 4 bits stored in each character of an
    hex encoding. For 13 bits, this means 4 characters.

    The function is carefully designed to be portable across tech stacks and easy to implement in
    many programming languages:

    - the hash is computed using sha256 which is available in all common languages,
    - the hash is using simple lowercased HEX encoding,
    - we use simple arithmetics on integer with modulo.

    The processing goes through these steps:

    First, a SHA256 hash is computed on the core PURL string encoded as UTF-8 bytes.

    Then, the hash digest bytes are converted to a big-endian integer, which is reduced modulo
    2 to the power of the bit count.

    Finally, this number is converted to hex, left-padded with zeros up to the hash length, and
    returned as a lowercase string.

    For example:

    The hash does not change with version, qualifiers or subpath::

        >>> get_purl_hash("pkg:pypi/license-expression@30.3.1")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression@10.3.1")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression@30.3.1?foo=bar&baz=bar#sub/path")
        '1050'

    The hash is left padded with zeros if it is shorter than the hash length::

        >>> get_purl_hash("pkg:pypi/expressionss")
        '0057'

    We normalize the PURL. Here pypi normalization always uses dash for underscore::

        >>> get_purl_hash("pkg:pypi/license_expression")
        '1050'
        >>> get_purl_hash("pkg:pypi/license-expression")
        '1050'

    Originally from:
    https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
    """
    # only the core purl (without version, qualifiers and subpath) takes part in the hash
    normalized_purl = get_core_purl(purl).to_string()
    digest = sha256(normalized_purl.encode("utf-8")).digest()
    # the digest is read as a big endian integer and truncated with a modulo on the bit count
    bucket = int.from_bytes(digest, "big") % (2**_bit_count)
    # each hex character stores 4 bits: this is the fixed length of the hash string
    width = ceil(_bit_count / 4)
    # lowercase hex string, left padded with 0
    return format(bucket, "x").zfill(width)
0 commit comments