Skip to content

Commit 72c198c

Browse files
authored
fix: improve robustness of version compare (#3694)
* fix: improve version compare robustness * fix: add missing docstrings * fix: improve prerelease handling * fix: remove hash detection, add tests & experiment script --------- Signed-off-by: Terri Oda <[email protected]>
1 parent 666a3f8 commit 72c198c

File tree

3 files changed

+108
-82
lines changed

3 files changed

+108
-82
lines changed

cve_bin_tool/version_compare.py

Lines changed: 42 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
Splits versions up using common whitespace delimiters and also splits out letters
1010
so that things like openSSL's 1.1.1y type of version will work too.
1111
12-
This may need some additional smarts for stuff like "rc" or "beta" and potentially for
13-
things like distro versioning. I don't know yet.
12+
This handles some pretty strange edge cases. See the test_version_compare.py
13+
and inline comments for details
14+
1415
"""
1516

1617

@@ -38,66 +39,25 @@ def parse_version(version_string: str):
3839
raise UnknownVersion(f"version string = {version_string}")
3940

4041
versionString = version_string.strip()
41-
versionArray = []
4242

4343
# convert all non alpha-numeric characters to be treated like . below
4444
# we could switch to a re split but it seems to leave blanks so this is less hassle
45-
versionString = re.sub("[^0-9a-zA-Z]+", ".", versionString)
46-
4745
# Note: This expression may need improvement if we need to handle unicode
46+
versionString = re.sub("[^0-9a-zA-Z]+", ".", versionString)
4847

49-
# remove any trailing . then split
50-
versionString = versionString.strip(".")
51-
split_version = versionString.split(".")
52-
53-
# if the whole string was numeric then we're done and you can move on
54-
if versionString.isnumeric():
55-
versionArray = split_version
56-
return versionArray
57-
58-
# Go through and split up anything like 6a in to 6 and a
59-
number_letter = re.compile("^([0-9]+)([a-zA-Z]+)$")
60-
letter_number = re.compile("^([a-zA-Z]+)([0-9]+)$")
61-
for section in split_version:
62-
# if it's all letters or all numbers, just add it to the array
63-
if section.isnumeric() or section.isalpha():
64-
versionArray.append(section)
65-
66-
# if it looks like 42a split out the letters and numbers
67-
# We will treat 42a as coming *after* version 42.
68-
elif re.match(number_letter, section):
69-
result = re.findall(number_letter, section)
70-
71-
# We're expecting a result that looks like [("42", "a")] but let's verify
72-
# and then add it to the array
73-
if len(result) == 1 and len(result[0]) == 2:
74-
versionArray.append(result[0][0])
75-
versionArray.append(result[0][1])
76-
else:
77-
raise CannotParseVersionException(f"version string = {versionString}")
78-
79-
# if it looks like rc1 or dev7 we'll leave it together as it may be some kind of pre-release
80-
# and we'll probably want to handle it specially in the compare.
81-
# We need to threat 42dev7 as coming *before* version 42.
82-
elif re.match(letter_number, section):
83-
versionArray.append(section)
84-
85-
# It's not a "pure" alpha or number string, it's not something like rc12 or 44g
48+
# We originally had hash detection in here, but it turns out very few companies
49+
# use hashes in ranges but more used dates that were getting caught in the same net
50+
# (see https://github.com/intel/cve-bin-tool/pull/3694 )
51+
# Hash deteciton may be useful in the future but it would have to be better defined.
8652

87-
# It could be a hash, which we can't string compare without knowledge of the product.
88-
# It could also be a distro release string like deb8u5, which we could compare
89-
# but the data may not be useful or usable in context.
90-
else:
91-
# If it's the last part of the version just drop it silently
92-
# we could log these but I suspect it would be very noisy
93-
if section == split_version[len(split_version) - 1]:
94-
pass
53+
# otherwise, split up letters and numbers into separate units for compare
54+
versionString = re.sub("([a-zA-Z]+)", r".\1.", versionString)
9555

96-
# if it's not, raise an exception because we should probably examine it
97-
elif versionString != ".":
98-
raise CannotParseVersionException(f"version string = {versionString}")
56+
# Clean up any duplicate . and then split
57+
versionString = re.sub(r"\.+", ".", versionString)
58+
split_version = versionString.strip(".").split(".")
9959

100-
return versionArray
60+
return split_version
10161

10262

10363
def version_compare(v1: str, v2: str):
@@ -106,12 +66,14 @@ def version_compare(v1: str, v2: str):
10666
10767
returns 0 if they're the same.
10868
returns 1 if v1 > v2
109-
returns -1 if v1 < v2findall
110-
n
69+
returns -1 if v1 < v2
11170
"""
11271
v1_array = parse_version(v1)
11372
v2_array = parse_version(v2)
11473

74+
# We'll treat the following strings as pre-releases.
75+
pre_release_words = {"pre", "rc", "alpha", "beta", "dev"}
76+
11577
for i in range(len(v1_array)):
11678
if len(v2_array) > i:
11779
# If it's all numbers, cast to int and compare
@@ -121,46 +83,44 @@ def version_compare(v1: str, v2: str):
12183
if int(v1_array[i]) < int(v2_array[i]):
12284
return -1
12385

124-
# If they're letters just do a string compare, I don't have a better idea
86+
# If they're letters do a string compare.
12587
# This might be a bad choice in some cases: Do we want ag < z?
12688
# I suspect projects using letters in version names may not use ranges in nvd
12789
# for this reason (e.g. openssl)
12890
# Converting to lower() so that 3.14a == 3.14A
12991
# but this may not be ideal in all cases
13092
elif v1_array[i].isalpha() and v2_array[i].isalpha():
93+
# allow pre-releases to come before arbitrary letters.
94+
if (
95+
v1_array[i] in pre_release_words
96+
and v2_array[i] not in pre_release_words
97+
):
98+
return -1
99+
if (
100+
v1_array[i] not in pre_release_words
101+
and v2_array[i] in pre_release_words
102+
):
103+
return 1
104+
105+
# Note that if both are in the pre-release list we alpha compare
131106
if v1_array[i].lower() > v2_array[i].lower():
132107
return 1
133108
if v1_array[i].lower() < v2_array[i].lower():
134109
return -1
135110

136111
else:
137112
# They are not the same type, and we're comparing mixed letters and numbers.
138-
# We'll treat letters as less than numbers.
139-
# This will result in things like rc1, dev9, b2 getting treated like pre-releases
140-
# as in https://peps.python.org/pep-0440/
141-
# So 1.2.pre4 would be less than 1.2.1 and (so would 1.2.post1)
113+
# We treat letters less than numbers
114+
115+
# This may cause false positives with some distro numbers
116+
# e.g. 1.4.ubuntu8 may have fixed some issues in 1.4,
117+
# But since we can't be sure we'll return the 'safer' result
118+
# and let users triage themselves.
142119
if v1_array[i].isalnum() and v2_array[i].isnumeric():
143120
return -1
144121
elif v1_array[i].isnumeric() and v2_array[i].isalnum():
145122
return 1
146123

147-
# They're both of type letter567 and we'll convert them to be letter.567 and
148-
# run them through the compare function again
149-
# We will be dictionary comparing so that 4.alpha4 < 4.beta1
150-
# but this also means .dev3 < .rc4 (because d is before r)
151-
# which may make less sense depending on the project.
152-
letter_number = re.compile("^[a-zA-Z]+[0-9]+$")
153-
if re.match(letter_number, v1_array[i]) and re.match(
154-
letter_number, v2_array[i]
155-
):
156-
v1_letter_number = re.sub(
157-
"([a-zA-Z]+)([0-9]+)", r"\1.\2", v1_array[i]
158-
)
159-
v2_letter_number = re.sub(
160-
"([a-zA-Z]+)([0-9]+)", r"\1.\2", v2_array[i]
161-
)
162-
return version_compare(v1_letter_number, v2_letter_number)
163-
164124
# And if all else fails, just compare the strings
165125
if v1_array[i] > v2_array[i]:
166126
return 1
@@ -171,7 +131,7 @@ def version_compare(v1: str, v2: str):
171131
# v1 has more digits than v2
172132
# Check to see if v1's something that looks like a pre-release (a2, dev8, rc4)
173133
# e.g. 4.5.a1 would be less than 4.5
174-
if re.match("([a-zA-Z]+)([0-9]+)", v1_array[i]):
134+
if v1_array[i] in pre_release_words:
175135
return -1
176136

177137
# Otherwise, v1 has more digits than v2 and the previous ones matched,
@@ -185,9 +145,9 @@ def version_compare(v1: str, v2: str):
185145
if v2_array[len(v1_array)].startswith("post"):
186146
return -1
187147

188-
# if what's in v2 next looks like a pre-release number (e.g. a2, dev8, rc4) then we'll
148+
# if what's in v2 next looks like a pre-release then we'll
189149
# claim v1 is still bigger, otherwise we'll say v2 is.
190-
if re.match("([0-9]+)([a-zA-Z]+)", v2_array[len(v1_array)]):
150+
if v2_array[len(v1_array)] in pre_release_words:
191151
return 1
192152

193153
return -1
@@ -232,4 +192,4 @@ def __ne__(self, other):
232192

233193
def __repr__(self):
234194
"""print the version string"""
235-
return f"Version: {self}"
195+
return f"Version: {self} aka {parse_version(self)}"

experiments/sqlite-experiments.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: GPL-3.0-or-later
3+
4+
"""
5+
A lazy script for searching the database via regexes.
6+
7+
This particular version was used to support the conclusion that my hash detection attempt
8+
in a version_compare PR would do more harm than good, but I'm checking it in so people can modify
9+
it for other data searches in future.
10+
11+
- Terri Oda
12+
"""
13+
14+
import re
15+
import sqlite3
16+
17+
dbcon = sqlite3.connect("/home/terri/.cache/cve-bin-tool/cve.db")
18+
dbcon.create_function("regexp", 2, lambda x, y: 1 if re.search(x, y) else 0)
19+
cursor = dbcon.cursor()
20+
21+
print("StartIncluding ===========")
22+
cursor.execute(
23+
"select vendor, product, versionStartIncluding from cve_range where versionStartIncluding REGEXP '[0-9a-fA-F]{8}'"
24+
)
25+
results = cursor.fetchall()
26+
for i in results:
27+
print(i)
28+
29+
print("StartExcluding ===========")
30+
cursor.execute(
31+
"select vendor, product, versionStartExcluding from cve_range where versionStartExcluding REGEXP '[0-9a-fA-F]{8}'"
32+
)
33+
results = cursor.fetchall()
34+
for i in results:
35+
print(i)
36+
37+
print("EndExcluding ===========")
38+
cursor.execute(
39+
"select vendor, product, versionEndExcluding from cve_range where versionEndExcluding REGEXP '[0-9a-fA-F]{8}'"
40+
)
41+
results = cursor.fetchall()
42+
for i in results:
43+
print(i)
44+
45+
print("EndIncluding ===========")
46+
cursor.execute(
47+
"select vendor, product, versionEndIncluding from cve_range where versionEndIncluding REGEXP '[0-9a-fA-F]{8}'"
48+
)
49+
results = cursor.fetchall()
50+
for i in results:
51+
print(i)

test/test_version_compare.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def test_eq(self):
1515
assert Version("1.1a") == Version("1.1A")
1616
assert Version("4.4.A") == Version("4.4.a")
1717
assert Version("5.6 ") == Version("5.6")
18+
assert Version("f835f2caaa") == Version("f835f2caaa")
1819

1920
def test_lt(self):
2021
"""Make sure < works between versions, including some with unusual version schemes"""
@@ -36,6 +37,11 @@ def test_lt(self):
3637
)
3738
assert Version("1.1.0l.1~deb9u2") < Version("2.0.0-1+deb9u1")
3839
assert Version("1.1.0l.1~deb9u2") < Version("1.1.0m")
40+
assert Version("8.9~deb7u9") < Version("8.9~deb9u6")
41+
assert Version("8.9~deb7u9") < Version("8.9~deb9u6")
42+
assert Version("3.9.pre1") < Version("3.9.u")
43+
assert Version("3.9.rc1") < Version("3.9.g")
44+
assert Version("pre4") < Version("3")
3945

4046
def test_gt(self):
4147
"""Make sure > works between versions, including some with unusual version schemes"""
@@ -53,10 +59,19 @@ def test_gt(self):
5359
"0.0.0.20190813141303.74dc4d7220e7"
5460
)
5561
assert Version("1.1.0m") > Version("1.1.0l.1~deb9u2")
62+
assert Version("8.9~deb9u6") > Version("8.9~deb7u9")
63+
assert Version("3.9.u") > Version("3.9.pre1")
64+
assert Version("3.9.g") > Version("3.9.rc1")
65+
assert Version("2") > Version("pre3")
5666

5767
def test_error(self):
5868
"""Make sure 'unknown' and blank strings raise appropriate errors"""
5969
with pytest.raises(UnknownVersion):
6070
Version("6") > Version("unknown")
6171
with pytest.raises(UnknownVersion):
6272
Version("") > Version("6")
73+
74+
def test_ne(self):
75+
"""Test some != cases with hashes to make sure we aren't comparing the string 'HASH'"""
76+
assert Version("f835f2caab") != Version("f835f2caaa")
77+
assert Version("HASH") != Version("f835f2caaa")

0 commit comments

Comments
 (0)