You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
//Build a forward and reverse map of merged entropy
57
-
//We can derive common substrings from it, as well as high-entropy substring hash functions
58
-
int[]left=newint[maxStr.Length];
59
-
int[]right=newint[maxStr.Length];
60
-
boolflag=true;
56
+
// The code below calculates entropy maps that can be used to derive the longest common substrings or longest prefix/suffix strings.
57
+
// It works by adding characters to an accumulator, and then potentially removing the value from it again if the characters are the same.
58
+
// If the accumulator for an offset contains 0 after all strings have been accumulated, it is highly likely that all the characters were the same.
59
+
// However, there is a risk that an accumulator is 0, even if the characters are not the same. So we do a sanity check at the end to ensure we did it right.
60
+
61
+
// int[]? map = null;
62
+
int[]?left=null;
63
+
int[]?right=null;
64
+
61
65
boolallAscii=true;
62
66
63
-
foreach(stringstrinkeys)
64
-
{
65
-
for(inti=0;i<str.Length;i++)
67
+
// Special case: If all strings have the same length, we can build an entropy map in O(n) with O(1) memory
68
+
// TODO: For now FastData only supports prefix/suffix
69
+
// if (minLength == maxStr.Length)
70
+
// {
71
+
// map = new int[minLength];
72
+
//
73
+
// foreach (string str in keys)
74
+
// {
75
+
// for (int i = 0; i < str.Length; i++)
76
+
// {
77
+
// char c = str[i];
78
+
// map[i] ^= c;
79
+
//
80
+
// if (c > 127)
81
+
// allAscii = false;
82
+
// }
83
+
// }
84
+
// }
85
+
// else
86
+
// {
87
+
//Build a forward and reverse map of merged entropy
88
+
//We can derive common prefix/suffix from it that can be used later for high-entropy hash/equality functions
89
+
left=newint[maxStr.Length];
90
+
right=newint[maxStr.Length];
91
+
92
+
foreach(stringstrinkeys)
66
93
{
67
-
charc=str[i];
68
-
charrc=str[str.Length-1-i];
94
+
for(inti=0;i<str.Length;i++)
95
+
{
96
+
charlc=str[i];
97
+
charrc=str[str.Length-1-i];
69
98
70
-
left[i]+=flag?c:-c;
71
-
right[i]+=flag?rc:-rc;
99
+
left[i]^=lc;
100
+
right[i]^=rc;
72
101
73
-
if(c>127)
74
-
allAscii=false;
102
+
if(lc>127)
103
+
allAscii=false;
104
+
}
75
105
}
76
106
77
-
flag=!flag;
78
-
}
79
-
80
-
//Odd number of items. We need it to be even
81
-
if(keys.Length%2!=0)
82
-
{
83
-
for(inti=0;i<maxStr.Length;i++)
107
+
//Odd number of items. We need it to be even
108
+
if(keys.Length%2!=0)
84
109
{
85
-
//For best mixing, we take the longest string
86
-
charc=maxStr[i];
87
-
charrc=maxStr[maxStr.Length-1-i];
110
+
for(inti=0;i<maxStr.Length;i++)
111
+
{
112
+
//For best mixing, we take the longest string
113
+
charlc=maxStr[i];
114
+
charrc=maxStr[maxStr.Length-1-i];
88
115
89
-
left[i]+=flag?c:-c;
90
-
right[i]+=flag?rc:-rc;
116
+
left[i]^=lc;
117
+
right[i]^=rc;
91
118
92
-
//We do not add to characterMap here since it does not need the duplicate
119
+
//We do not add to characterMap here since it does not need the duplicate
0 commit comments