Skip to content

Commit 1619046

Browse files
committed
Rewrite comparison code to fit C# better, add tests
1 parent 4fda55d commit 1619046

File tree

2 files changed

+41
-58
lines changed

2 files changed

+41
-58
lines changed

SabreTools.Hashing.Test/SpamSumTests.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ public class SpamSumTests
2626
[InlineData("24576:fCQxhkAcV6cUdRxczoyVQQFDSVRNihk24vXDj20sq:6Q/q6bazwMgRNihk24jtsq", "24576:p+QxhkAcV6cUdRxczoy3NmO0ne3HFVjSeQ229SVjeONr+v:YQ/q6baz5Nqe3H2eQzStBa", 54)]
2727
// Duplicate sequence truncation
2828
[InlineData("500:AAAAAAAAAAAAAAAAAAAAAAAAyENFACBE+rW6Tj7SMQmK:4", "500:AAAyENFACBE+rW6Tj7SMQmK:4", 100)]
29+
// Trailing data ignored
30+
[InlineData("6:l+lq/MtlM8pJ0gt6lXWogE61UlT1Uqj1akMD5n:l+l6Mtl/n0gtOXmEuUl5UqpakM9n,ANYTHING", "6:mTj3qJskr+V+1o21+n0rtD2noPWKlAyjllZmMt6120EK+wlsS6T1oLwXuk4tk7:m/bk/1oQrJL3jTu20EK+wlsp5oO4tk7,NOTHING", 0)]
31+
[InlineData("6:mTj3qJskr+V+1o21+n0rtD2noPWKlAyjllZmMt6120EK+wlsS6T1oLwXuk4tk7:m/bk/1oQrJL3jTu20EK+wlsp5oO4tk7,NOTHING", "6:l+lq/MtlM8pJ0gt6lXWogE61UlT1Uqj1akMD5n:l+l6Mtl/n0gtOXmEuUl5UqpakM9n,ANYTHING", 0)]
2932
// Rolling window - larger than 7
3033
[InlineData("500:7SMQmKa:3", "500:7SMQmKr:3", 0)]
3134
// Rolling window - smaller than 7

SabreTools.Hashing/SpamSum/Comparisons.cs

Lines changed: 38 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ namespace SabreTools.Hashing.SpamSum;
55

66
internal static class Comparisons
77
{
8+
/// <summary>
9+
/// Regex matching every character past the third in a run of the same character; removing the matches trims such runs to length 3
10+
/// </summary>
11+
private static Regex _reduceRegex = new("(.)(?<=\\1\\1\\1\\1)", RegexOptions.Compiled);
12+
813
/// <summary>
914
/// Compares how similar two SpamSums are to each other
1015
/// </summary>
@@ -15,86 +20,61 @@ internal static class Comparisons
1520
/// <see href="https://github.com/ssdeep-project/ssdeep/blob/df3b860f8918261b3faeec9c7d2c8a241089e6e6/fuzzy.c#L860"/>
1621
public static int FuzzyCompare(string? first, string? second)
1722
{
18-
if (first == null || second == null)
23+
// If either input is invalid
24+
if (string.IsNullOrEmpty(first) || string.IsNullOrEmpty(second))
1925
return -1;
2026

21-
// Each SpamSum string starts with its block size before the first semicolon.
22-
// Verify it's there and return otherwise.
27+
// Split the string into 3 parts for processing
28+
var firstSplit = first!.Split(':');
29+
var secondSplit = second!.Split(':');
30+
if (firstSplit.Length != 3 || secondSplit.Length != 3)
31+
return -1;
2332

24-
int firstPrefixIndex = first.IndexOf(':');
25-
if (firstPrefixIndex == -1)
33+
// If any of the required parts are empty
34+
if (firstSplit[0].Length == 0 || firstSplit[2].Length == 0)
2635
return -1;
27-
if (!uint.TryParse(first.Substring(0, firstPrefixIndex), out uint firstBlockSize))
36+
if (secondSplit[0].Length == 0 || secondSplit[2].Length == 0)
2837
return -1;
2938

30-
int secondPrefixIndex = second.IndexOf(':');
31-
if (secondPrefixIndex == -1)
39+
// Each SpamSum string starts with its block size before the first colon.
40+
if (!uint.TryParse(firstSplit[0], out uint firstBlockSize))
3241
return -1;
33-
if (!uint.TryParse(second.Substring(0, secondPrefixIndex), out uint secondBlockSize))
42+
if (!uint.TryParse(secondSplit[0], out uint secondBlockSize))
3443
return -1;
3544

36-
// Check if blocksizes don't match. Each spamSum is broken up into two blocks. fuzzy_compare allows you to
37-
// compare if one block in one hash is the same size as one block in the other hash, even if the other two are
38-
// non-matching, so that's also checked for.
39-
if (firstBlockSize != secondBlockSize &&
40-
(firstBlockSize > uint.MaxValue / 2 || firstBlockSize * 2 != secondBlockSize) &&
41-
(firstBlockSize % 2 == 1 || firstBlockSize / 2 != secondBlockSize))
45+
// Check if blocksizes don't match. Each spamSum is broken up into two blocks.
46+
// fuzzy_compare allows you to compare if one block in one hash is the same
47+
// size as one block in the other hash, even if the other two are non-matching,
48+
// so that's also checked for.
49+
if (firstBlockSize != secondBlockSize
50+
&& (firstBlockSize > uint.MaxValue / 2 || firstBlockSize * 2 != secondBlockSize)
51+
&& (firstBlockSize % 2 == 1 || firstBlockSize / 2 != secondBlockSize))
4252
{
4353
return 0;
4454
}
4555

46-
// Get the spamSum strings starting past the blocksize prefix.
47-
first = first.Substring(firstPrefixIndex + 1);
48-
second = second.Substring(secondPrefixIndex + 1);
49-
50-
// Make sure there's something there
51-
if (string.IsNullOrEmpty(first) || string.IsNullOrEmpty(second))
52-
return -1;
53-
54-
// Split each spamSum into two blocks.
55-
// Unclear why the second blocks must end before commas, but it is what fuzzy_compare does.
56-
// If a spamSum doesn't have two parts past the prefix, it's malformed and must be returned.
56+
// Ensure only second block data before a comma is used
57+
string firstBlockTwo = firstSplit[2].Split(',')[0];
58+
string secondBlockTwo = secondSplit[2].Split(',')[0];
5759

58-
var tempSplit = first.Split(':');
59-
var firstBlockOne = tempSplit[0];
60-
if (tempSplit.Length == 1 || string.IsNullOrEmpty(tempSplit[1]))
61-
return -1;
62-
var firstBlockTwo = tempSplit[1].Split(',')[0];
63-
64-
tempSplit = second.Split(':');
65-
var secondBlockOne = tempSplit[0];
66-
if (tempSplit.Length == 1 || string.IsNullOrEmpty(tempSplit[1]))
67-
return -1;
68-
var secondBlockTwo = tempSplit[1].Split(',')[0];
69-
70-
// The comments for fuzzy_compare say to "Eliminate any sequences [of the same character] longer than 3".
71-
// What this actually means is that any sequences of the same character longer than 3 need to be reduced to size 3,
72-
// i.e. "9AgX87HAAAAAOKG5/Dqj3C2o/jlqW7Yn/nmcwlcKCwA9aJo9FcAKwf" becomes "9AgX87HAAAOKG5/Dqj3C2o/jlqW7Yn/nmcwlcKCwA9aJo9FcAKwf"
73-
// The reason for doing this is that these sequences contain very little info, so cutting them down helps with
74-
// part of scoring the strings later.
75-
var r = new Regex("(.)(?<=\\1\\1\\1\\1)", RegexOptions.Compiled);
76-
77-
firstBlockOne = r.Replace(firstBlockOne, string.Empty);
78-
firstBlockTwo = r.Replace(firstBlockTwo, string.Empty);
79-
secondBlockOne = r.Replace(secondBlockOne, string.Empty);
80-
secondBlockTwo = r.Replace(secondBlockTwo, string.Empty);
60+
// Reduce any runs of the same character longer than 3 down to 3 characters
61+
// These sequences contain very little info and can be reduced as a result
62+
string firstBlockOne = _reduceRegex.Replace(firstSplit[1], string.Empty);
63+
firstBlockTwo = _reduceRegex.Replace(firstBlockTwo, string.Empty);
64+
string secondBlockOne = _reduceRegex.Replace(secondSplit[1], string.Empty);
65+
secondBlockTwo = _reduceRegex.Replace(secondBlockTwo, string.Empty);
8166

8267
// Return 100 immediately if both spamSums are identical.
83-
if (firstBlockSize == secondBlockSize
84-
&& firstBlockOne.Length == secondBlockOne.Length
85-
&& firstBlockTwo.Length == secondBlockTwo.Length)
86-
{
87-
if (firstBlockOne == secondBlockOne && firstBlockTwo == secondBlockTwo)
88-
return 100;
89-
}
68+
if (firstBlockSize == secondBlockSize && firstBlockOne == secondBlockOne && firstBlockTwo == secondBlockTwo)
69+
return 100;
9070

9171
// Choose different scoring combinations depending on block sizes present.
9272
if (firstBlockSize <= uint.MaxValue / 2)
9373
{
9474
if (firstBlockSize == secondBlockSize)
9575
{
96-
var score1 = ScoreStrings(firstBlockOne, secondBlockOne, firstBlockSize);
97-
var score2 = ScoreStrings(firstBlockTwo, secondBlockTwo, firstBlockSize * 2);
76+
uint score1 = ScoreStrings(firstBlockOne, secondBlockOne, firstBlockSize);
77+
uint score2 = ScoreStrings(firstBlockTwo, secondBlockTwo, firstBlockSize * 2);
9878
return (int)Math.Max(score1, score2);
9979
}
10080
else if (firstBlockSize * 2 == secondBlockSize)

0 commit comments

Comments
 (0)