fix: improve robustness of version compare (#3694)

terriko · web-flow · commit 72c198ce878a · 2024-01-22T12:07:50.000-08:00
* fix: improve version compare robustness
* fix: add missing docstrings 
* fix: improve prerelease handling
* fix: remove hash detection, add tests & experiment script

---------

Signed-off-by: Terri Oda <terri.oda@intel.com>
diff --git a/cve_bin_tool/version_compare.py b/cve_bin_tool/version_compare.py
@@ -9,8 +9,9 @@
 Splits versions up using common whitespace delimiters and also splits out letters
 so that things like openSSL's 1.1.1y type of version will work too.
 
-This may need some additional smarts for stuff like "rc" or "beta" and potentially for
-things like distro versioning.  I don't know yet.
+This handles some pretty strange edge cases.  See the test_version_compare.py
+and inline comments for details.
+
 """
 
 
@@ -38,66 +39,25 @@ def parse_version(version_string: str):
         raise UnknownVersion(f"version string = {version_string}")
 
     versionString = version_string.strip()
-    versionArray = []
 
     # convert all non alpha-numeric characters to be treated like . below
     # we could switch to a re split but it seems to leave blanks so this is less hassle
-    versionString = re.sub("[^0-9a-zA-Z]+", ".", versionString)
-
     # Note: This expression may need improvement if we need to handle unicode
+    versionString = re.sub("[^0-9a-zA-Z]+", ".", versionString)
 
-    # remove any trailing . then split
-    versionString = versionString.strip(".")
-    split_version = versionString.split(".")
-
-    # if the whole string was numeric then we're done and you can move on
-    if versionString.isnumeric():
-        versionArray = split_version
-        return versionArray
-
-    # Go through and split up anything like 6a in to 6 and a
-    number_letter = re.compile("^([0-9]+)([a-zA-Z]+)$")
-    letter_number = re.compile("^([a-zA-Z]+)([0-9]+)$")
-    for section in split_version:
-        # if it's all letters or all numbers, just add it to the array
-        if section.isnumeric() or section.isalpha():
-            versionArray.append(section)
-
-        # if it looks like 42a split out the letters and numbers
-        # We will treat 42a as coming *after* version 42.
-        elif re.match(number_letter, section):
-            result = re.findall(number_letter, section)
-
-            # We're expecting a result that looks like [("42", "a")] but let's verify
-            # and then add it to the array
-            if len(result) == 1 and len(result[0]) == 2:
-                versionArray.append(result[0][0])
-                versionArray.append(result[0][1])
-            else:
-                raise CannotParseVersionException(f"version string = {versionString}")
-
-        # if it looks like rc1 or dev7 we'll leave it together as it may be some kind of pre-release
-        # and we'll probably want to handle it specially in the compare.
-        # We need to threat 42dev7 as coming *before* version 42.
-        elif re.match(letter_number, section):
-            versionArray.append(section)
-
-        # It's not a "pure" alpha or number string, it's not something like rc12 or 44g
+    # We originally had hash detection in here, but it turns out very few companies
+    # use hashes in ranges, while more of them use dates that were getting caught in
+    # the same net (see https://github.com/intel/cve-bin-tool/pull/3694 ).
+    # Hash detection may be useful in the future but it would have to be better defined.
 
-        # It could be a hash, which we can't string compare without knowledge of the product.
-        # It could also be a distro release string like deb8u5, which we could compare
-        # but the data may not be useful or usable in context.
-        else:
-            # If it's the last part of the version just drop it silently
-            # we could log these but I suspect it would be very noisy
-            if section == split_version[len(split_version) - 1]:
-                pass
+    # otherwise, split up letters and numbers into separate units for compare
+    versionString = re.sub("([a-zA-Z]+)", r".\1.", versionString)
 
-            # if it's not, raise an exception because we should probably examine it
-            elif versionString != ".":
-                raise CannotParseVersionException(f"version string = {versionString}")
+    # Clean up any duplicate . and then split
+    versionString = re.sub(r"\.+", ".", versionString)
+    split_version = versionString.strip(".").split(".")
 
-    return versionArray
+    return split_version
 
 
 def version_compare(v1: str, v2: str):
@@ -106,12 +66,14 @@ def version_compare(v1: str, v2: str):
 
     returns 0 if they're the same.
     returns 1 if v1 > v2
-    returns -1 if v1 < v2findall
-    n
+    returns -1 if v1 < v2
     """
     v1_array = parse_version(v1)
     v2_array = parse_version(v2)
 
+    # We'll treat the following strings as pre-releases.
+    pre_release_words = {"pre", "rc", "alpha", "beta", "dev"}
+
     for i in range(len(v1_array)):
         if len(v2_array) > i:
             # If it's all numbers, cast to int and compare
@@ -121,46 +83,44 @@ def version_compare(v1: str, v2: str):
                 if int(v1_array[i]) < int(v2_array[i]):
                     return -1
 
-            # If they're letters just do a string compare, I don't have a better idea
+            # If they're letters do a string compare.
             # This might be a bad choice in some cases: Do we want ag < z?
             # I suspect projects using letters in version names may not use ranges in nvd
             # for this reason (e.g. openssl)
             # Converting to lower() so that 3.14a == 3.14A
             # but this may not be ideal in all cases
             elif v1_array[i].isalpha() and v2_array[i].isalpha():
+                # allow pre-releases to come before arbitrary letters.
+                if (
+                    v1_array[i] in pre_release_words
+                    and v2_array[i] not in pre_release_words
+                ):
+                    return -1
+                if (
+                    v1_array[i] not in pre_release_words
+                    and v2_array[i] in pre_release_words
+                ):
+                    return 1
+
+                # Note that if both are in the pre-release list we alpha compare
                 if v1_array[i].lower() > v2_array[i].lower():
                     return 1
                 if v1_array[i].lower() < v2_array[i].lower():
                     return -1
 
             else:
                 # They are not the same type, and we're comparing mixed letters and numbers.
-                # We'll treat letters as less than numbers.
-                # This will result in things like rc1, dev9, b2 getting treated like pre-releases
-                # as in https://peps.python.org/pep-0440/
-                # So 1.2.pre4 would be less than 1.2.1 and (so would 1.2.post1)
+                # We treat letters as less than numbers.
+
+                # This may cause false positives with some distro numbers
+                # e.g. 1.4.ubuntu8 may have fixed some issues in 1.4,
+                # But since we can't be sure we'll return the 'safer' result
+                # and let users triage themselves.
                 if v1_array[i].isalnum() and v2_array[i].isnumeric():
                     return -1
                 elif v1_array[i].isnumeric() and v2_array[i].isalnum():
                     return 1
 
-                # They're both of type letter567 and we'll convert them to be letter.567 and
-                # run them through the compare function again
-                # We will be dictionary comparing so that 4.alpha4 < 4.beta1
-                # but this also means .dev3 < .rc4 (because d is before r)
-                # which may make less sense depending on the project.
-                letter_number = re.compile("^[a-zA-Z]+[0-9]+$")
-                if re.match(letter_number, v1_array[i]) and re.match(
-                    letter_number, v2_array[i]
-                ):
-                    v1_letter_number = re.sub(
-                        "([a-zA-Z]+)([0-9]+)", r"\1.\2", v1_array[i]
-                    )
-                    v2_letter_number = re.sub(
-                        "([a-zA-Z]+)([0-9]+)", r"\1.\2", v2_array[i]
-                    )
-                    return version_compare(v1_letter_number, v2_letter_number)
-
                 # And if all else fails, just compare the strings
                 if v1_array[i] > v2_array[i]:
                     return 1
@@ -171,7 +131,7 @@ def version_compare(v1: str, v2: str):
             # v1 has more digits than v2
             # Check to see if v1's something that looks like a pre-release (a2, dev8, rc4)
             # e.g. 4.5.a1 would be less than 4.5
-            if re.match("([a-zA-Z]+)([0-9]+)", v1_array[i]):
+            if v1_array[i] in pre_release_words:
                 return -1
 
             # Otherwise, v1 has more digits than v2 and the previous ones matched,
@@ -185,9 +145,9 @@ def version_compare(v1: str, v2: str):
         if v2_array[len(v1_array)].startswith("post"):
             return -1
 
-        # if what's in v2 next looks like a pre-release number (e.g. a2, dev8, rc4) then we'll
+        # if what's in v2 next looks like a pre-release then we'll
         # claim v1 is still bigger, otherwise we'll say v2 is.
-        if re.match("([0-9]+)([a-zA-Z]+)", v2_array[len(v1_array)]):
+        if v2_array[len(v1_array)] in pre_release_words:
             return 1
 
         return -1
@@ -232,4 +192,4 @@ def __ne__(self, other):
 
     def __repr__(self):
         """print the version string"""
-        return f"Version: {self}"
+        return f"Version: {self} aka {parse_version(self)}"
diff --git a/experiments/sqlite-experiments.py b/experiments/sqlite-experiments.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""
+A lazy script for searching the database via regexes.
+
+This particular version was used to support the conclusion that my hash detection attempt
+in a version_compare PR would do more harm than good, but I'm checking it in so people can modify
+it for other data searches in future.
+
+- Terri Oda
+"""
+
+import re
+import sqlite3
+
+dbcon = sqlite3.connect("/home/terri/.cache/cve-bin-tool/cve.db")
+dbcon.create_function("regexp", 2, lambda x, y: 1 if re.search(x, y) else 0)
+cursor = dbcon.cursor()
+
+print("StartIncluding ===========")
+cursor.execute(
+    "select vendor, product, versionStartIncluding from cve_range where versionStartIncluding REGEXP '[0-9a-fA-F]{8}'"
+)
+results = cursor.fetchall()
+for i in results:
+    print(i)
+
+print("StartExcluding ===========")
+cursor.execute(
+    "select vendor, product, versionStartExcluding from cve_range where versionStartExcluding REGEXP '[0-9a-fA-F]{8}'"
+)
+results = cursor.fetchall()
+for i in results:
+    print(i)
+
+print("EndExcluding ===========")
+cursor.execute(
+    "select vendor, product, versionEndExcluding from cve_range where versionEndExcluding REGEXP '[0-9a-fA-F]{8}'"
+)
+results = cursor.fetchall()
+for i in results:
+    print(i)
+
+print("EndIncluding ===========")
+cursor.execute(
+    "select vendor, product, versionEndIncluding from cve_range where versionEndIncluding REGEXP '[0-9a-fA-F]{8}'"
+)
+results = cursor.fetchall()
+for i in results:
+    print(i)
diff --git a/test/test_version_compare.py b/test/test_version_compare.py
@@ -15,6 +15,7 @@ def test_eq(self):
         assert Version("1.1a") == Version("1.1A")
         assert Version("4.4.A") == Version("4.4.a")
         assert Version("5.6   ") == Version("5.6")
+        assert Version("f835f2caaa") == Version("f835f2caaa")
 
     def test_lt(self):
         """Make sure < works between versions, including some with unusual version schemes"""
@@ -36,6 +37,11 @@ def test_lt(self):
         )
         assert Version("1.1.0l.1~deb9u2") < Version("2.0.0-1+deb9u1")
         assert Version("1.1.0l.1~deb9u2") < Version("1.1.0m")
+        assert Version("8.9~deb7u9") < Version("8.9~deb9u6")
+        assert Version("8.9~deb7u9") < Version("8.9~deb9u6")
+        assert Version("3.9.pre1") < Version("3.9.u")
+        assert Version("3.9.rc1") < Version("3.9.g")
+        assert Version("pre4") < Version("3")
 
     def test_gt(self):
         """Make sure > works between versions, including some with unusual version schemes"""
@@ -53,10 +59,19 @@ def test_gt(self):
             "0.0.0.20190813141303.74dc4d7220e7"
         )
         assert Version("1.1.0m") > Version("1.1.0l.1~deb9u2")
+        assert Version("8.9~deb9u6") > Version("8.9~deb7u9")
+        assert Version("3.9.u") > Version("3.9.pre1")
+        assert Version("3.9.g") > Version("3.9.rc1")
+        assert Version("2") > Version("pre3")
 
     def test_error(self):
         """Make sure 'unknown' and blank strings raise appropriate errors"""
         with pytest.raises(UnknownVersion):
             Version("6") > Version("unknown")
         with pytest.raises(UnknownVersion):
             Version("") > Version("6")
+
+    def test_ne(self):
+        """Test some != cases with hashes to make sure we aren't comparing the string 'HASH'"""
+        assert Version("f835f2caab") != Version("f835f2caaa")
+        assert Version("HASH") != Version("f835f2caaa")