@@ -5,6 +5,11 @@ namespace SabreTools.Hashing.SpamSum;
55
66internal static class Comparisons
77{
/// <summary>
/// Regex used to reduce any run of the same character longer than 3 down to exactly 3
/// </summary>
/// <remarks>
/// The pattern matches a single character only when it and the three characters
/// before it are all identical, i.e. it is the 4th or later character of a run.
/// Replacing every match with an empty string therefore trims each run to three
/// characters, e.g. "9AgX87HAAAAAOKG5" becomes "9AgX87HAAAOKG5".
/// The instance is compiled once and shared, so it is readonly to keep it from
/// being reassigned.
/// </remarks>
private static readonly Regex _reduceRegex = new("(.)(?<=\\1\\1\\1\\1)", RegexOptions.Compiled);
12+
813 /// <summary>
914 /// Compares how similar two SpamSums are to each other
1015 /// </summary>
@@ -15,86 +20,61 @@ internal static class Comparisons
1520 /// <see href="https://github.com/ssdeep-project/ssdeep/blob/df3b860f8918261b3faeec9c7d2c8a241089e6e6/fuzzy.c#L860"/>
1621 public static int FuzzyCompare ( string ? first , string ? second )
1722 {
18- if ( first == null || second == null )
23+ // If either input is invalid
24+ if ( string . IsNullOrEmpty ( first ) || string . IsNullOrEmpty ( second ) )
1925 return - 1 ;
2026
21- // Each SpamSum string starts with its block size before the first semicolon.
22- // Verify it's there and return otherwise.
27+ // Split the string into 3 parts for processing
28+ var firstSplit = first ! . Split ( ':' ) ;
29+ var secondSplit = second ! . Split ( ':' ) ;
30+ if ( firstSplit . Length != 3 || secondSplit . Length != 3 )
31+ return - 1 ;
2332
24- int firstPrefixIndex = first . IndexOf ( ':' ) ;
25- if ( firstPrefixIndex == - 1 )
33+ // If any of the required parts are empty
34+ if ( firstSplit [ 0 ] . Length == 0 || firstSplit [ 2 ] . Length == 0 )
2635 return - 1 ;
27- if ( ! uint . TryParse ( first . Substring ( 0 , firstPrefixIndex ) , out uint firstBlockSize ) )
36+ if ( secondSplit [ 0 ] . Length == 0 || secondSplit [ 2 ] . Length == 0 )
2837 return - 1 ;
2938
30- int secondPrefixIndex = second . IndexOf ( ':' ) ;
31- if ( secondPrefixIndex == - 1 )
39+ // Each SpamSum string starts with its block size before the first colon.
40+ if ( ! uint . TryParse ( firstSplit [ 0 ] , out uint firstBlockSize ) )
3241 return - 1 ;
33- if ( ! uint . TryParse ( second . Substring ( 0 , secondPrefixIndex ) , out uint secondBlockSize ) )
42+ if ( ! uint . TryParse ( secondSplit [ 0 ] , out uint secondBlockSize ) )
3443 return - 1 ;
3544
36- // Check if blocksizes don't match. Each spamSum is broken up into two blocks. fuzzy_compare allows you to
37- // compare if one block in one hash is the same size as one block in the other hash, even if the other two are
38- // non-matching, so that's also checked for.
39- if ( firstBlockSize != secondBlockSize &&
40- ( firstBlockSize > uint . MaxValue / 2 || firstBlockSize * 2 != secondBlockSize ) &&
41- ( firstBlockSize % 2 == 1 || firstBlockSize / 2 != secondBlockSize ) )
45+ // Check if blocksizes don't match. Each spamSum is broken up into two blocks.
46+ // fuzzy_compare allows you to compare if one block in one hash is the same
47+ // size as one block in the other hash, even if the other two are non-matching,
48+ // so that's also checked for.
49+ if ( firstBlockSize != secondBlockSize
50+ && ( firstBlockSize > uint . MaxValue / 2 || firstBlockSize * 2 != secondBlockSize )
51+ && ( firstBlockSize % 2 == 1 || firstBlockSize / 2 != secondBlockSize ) )
4252 {
4353 return 0 ;
4454 }
4555
46- // Get the spamSum strings starting past the blocksize prefix.
47- first = first . Substring ( firstPrefixIndex + 1 ) ;
48- second = second . Substring ( secondPrefixIndex + 1 ) ;
49-
50- // Make sure there's something there
51- if ( string . IsNullOrEmpty ( first ) || string . IsNullOrEmpty ( second ) )
52- return - 1 ;
53-
54- // Split each spamSum into two blocks.
55- // Unclear why the second blocks must end before commas, but it is what fuzzy_compare does.
56- // If a spamSum doesn't have two parts past the prefix, it's malformed and must be returned.
56+ // Ensure only second block data before a comma is used
57+ string firstBlockTwo = firstSplit [ 2 ] . Split ( ',' ) [ 0 ] ;
58+ string secondBlockTwo = secondSplit [ 2 ] . Split ( ',' ) [ 0 ] ;
5759
58- var tempSplit = first . Split ( ':' ) ;
59- var firstBlockOne = tempSplit [ 0 ] ;
60- if ( tempSplit . Length == 1 || string . IsNullOrEmpty ( tempSplit [ 1 ] ) )
61- return - 1 ;
62- var firstBlockTwo = tempSplit [ 1 ] . Split ( ',' ) [ 0 ] ;
63-
64- tempSplit = second . Split ( ':' ) ;
65- var secondBlockOne = tempSplit [ 0 ] ;
66- if ( tempSplit . Length == 1 || string . IsNullOrEmpty ( tempSplit [ 1 ] ) )
67- return - 1 ;
68- var secondBlockTwo = tempSplit [ 1 ] . Split ( ',' ) [ 0 ] ;
69-
70- // The comments for fuzzy_compare say to "Eliminate any sequences [of the same character] longer than 3".
71- // What this actually means is that any sequences of the same character longer than 3 need to be reduced to size 3,
72- // i.e. "9AgX87HAAAAAOKG5/Dqj3C2o/jlqW7Yn/nmcwlcKCwA9aJo9FcAKwf" becomes "9AgX87HAAAOKG5/Dqj3C2o/jlqW7Yn/nmcwlcKCwA9aJo9FcAKwf"
73- // The reason for doing this is that these sequences contain very little info, so cutting them down helps with
74- // part of scoring the strings later.
75- var r = new Regex ( "(.)(?<=\\ 1\\ 1\\ 1\\ 1)" , RegexOptions . Compiled ) ;
76-
77- firstBlockOne = r . Replace ( firstBlockOne , string . Empty ) ;
78- firstBlockTwo = r . Replace ( firstBlockTwo , string . Empty ) ;
79- secondBlockOne = r . Replace ( secondBlockOne , string . Empty ) ;
80- secondBlockTwo = r . Replace ( secondBlockTwo , string . Empty ) ;
60+ // Reduce any sequences longer than 3
61+ // These sequences contain very little info and can be reduced as a result
62+ string firstBlockOne = _reduceRegex . Replace ( firstSplit [ 1 ] , string . Empty ) ;
63+ firstBlockTwo = _reduceRegex . Replace ( firstBlockTwo , string . Empty ) ;
64+ string secondBlockOne = _reduceRegex . Replace ( secondSplit [ 1 ] , string . Empty ) ;
65+ secondBlockTwo = _reduceRegex . Replace ( secondBlockTwo , string . Empty ) ;
8166
8267 // Return 100 immediately if both spamSums are identical.
83- if ( firstBlockSize == secondBlockSize
84- && firstBlockOne . Length == secondBlockOne . Length
85- && firstBlockTwo . Length == secondBlockTwo . Length )
86- {
87- if ( firstBlockOne == secondBlockOne && firstBlockTwo == secondBlockTwo )
88- return 100 ;
89- }
68+ if ( firstBlockSize == secondBlockSize && firstBlockOne == secondBlockOne && firstBlockTwo == secondBlockTwo )
69+ return 100 ;
9070
9171 // Choose different scoring combinations depending on block sizes present.
9272 if ( firstBlockSize <= uint . MaxValue / 2 )
9373 {
9474 if ( firstBlockSize == secondBlockSize )
9575 {
96- var score1 = ScoreStrings ( firstBlockOne , secondBlockOne , firstBlockSize ) ;
97- var score2 = ScoreStrings ( firstBlockTwo , secondBlockTwo , firstBlockSize * 2 ) ;
76+ uint score1 = ScoreStrings ( firstBlockOne , secondBlockOne , firstBlockSize ) ;
77+ uint score2 = ScoreStrings ( firstBlockTwo , secondBlockTwo , firstBlockSize * 2 ) ;
9878 return ( int ) Math . Max ( score1 , score2 ) ;
9979 }
10080 else if ( firstBlockSize * 2 == secondBlockSize )
0 commit comments