Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 5d7b3f9

Browse files
author
Viktor Hofer
committed
Merged PR 100702: Move to marvin hashing for OrdinalIgnoreCase CompareInfo & StringComparers
Move to marvin hashing for OrdinalIgnoreCase CompareInfo & StringComparers.
2 parents fcdc1ec + 09db064 commit 5d7b3f9

File tree

7 files changed

+76
-69
lines changed

7 files changed

+76
-69
lines changed

src/classlibnative/bcltype/stringnative.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,25 @@ FCIMPL3(INT32, COMString::Marvin32HashString, StringObject* thisRefUNSAFE, INT32
182182
}
183183
FCIMPLEND
184184

185+
// FCALL entry point that hashes a raw WCHAR buffer instead of a managed string object.
//
//   pRawStr - pointer to the characters to hash; a NULL pointer raises
//             kNullReferenceException.
//   strLen  - length forwarded to the hash provider; it is passed through
//             without any validation in this function.
//
// Returns the INT32 produced by GetCurrentNlsHashProvider()->HashString.
// NOTE(review): the name implies Marvin32, but the algorithm is whatever the current
// NLS hash provider implements, which is not visible from this function - confirm.
FCIMPL2(INT32, COMString::Marvin32HashPtr, WCHAR *pRawStr, INT32 strLen) {
186+
FCALL_CONTRACT;
187+
188+
int iReturnHash = 0;
189+
190+
// Reject a null buffer up front so the hash provider is never handed NULL.
if (pRawStr == NULL) {
191+
FCThrow(kNullReferenceException);
192+
}
193+
194+
// The provider call runs inside an SO-intolerant region; the second macro argument
// (FCThrow of kStackOverflowException) is presumably what runs if the region cannot
// be entered - confirm against the macro definition.
BEGIN_SO_INTOLERANT_CODE_NOTHROW(GetThread(), FCThrow(kStackOverflowException));
195+
// NOTE(review): TRUE and 0 are fixed here. The sibling Marvin32HashString takes an
// additionalEntropy parameter, so 0 presumably means "no additional entropy" - verify
// against the HashString signature.
iReturnHash = GetCurrentNlsHashProvider()->HashString(pRawStr, strLen, TRUE, 0);
196+
END_SO_INTOLERANT_CODE;
197+
198+
FC_GC_POLL_RET();
199+
200+
return iReturnHash;
201+
}
202+
FCIMPLEND
203+
185204
BOOL QCALLTYPE COMString::UseRandomizedHashing() {
186205
QCALL_CONTRACT;
187206

src/classlibnative/bcltype/stringnative.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ class COMString {
8787

8888
#ifdef FEATURE_RANDOMIZED_STRING_HASHING
8989
static FCDECL3(INT32, Marvin32HashString, StringObject* thisRefUNSAFE, INT32 strLen, INT64 additionalEntropy);
90+
static FCDECL2(INT32, Marvin32HashPtr, WCHAR *pRawStr, INT32 strLen);
9091
static BOOL QCALLTYPE UseRandomizedHashing();
9192
#endif // FEATURE_RANDOMIZED_STRING_HASHING
9293

src/mscorlib/shared/System/StringComparer.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ public override int GetHashCode(string obj)
287287

288288
if (_ignoreCase)
289289
{
290-
return TextInfo.GetHashCodeOrdinalIgnoreCase(obj);
290+
return CompareInfo.Invariant.GetHashCode(obj, CompareOptions.OrdinalIgnoreCase);
291291
}
292292

293293
return obj.GetHashCode();
@@ -363,7 +363,7 @@ public override int GetHashCode(string obj)
363363
throw new ArgumentNullException(nameof(obj));
364364
#endif
365365
}
366-
return TextInfo.GetHashCodeOrdinalIgnoreCase(obj);
366+
return CompareInfo.Invariant.GetHashCode(obj, CompareOptions.OrdinalIgnoreCase);
367367
}
368368

369369
public void GetObjectData(SerializationInfo info, StreamingContext context)

src/mscorlib/src/System/Globalization/CompareInfo.cs

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ public partial class CompareInfo : IDeserializationCallback
5757
~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace |
5858
CompareOptions.IgnoreWidth | CompareOptions.IgnoreKanaType | CompareOptions.StringSort);
5959

60+
// We cache the invariant compareinfo as we need it for OrdinalIgnoreCase hashing
61+
internal static readonly CompareInfo Invariant = CultureInfo.InvariantCulture.CompareInfo;
62+
6063
//
6164
// CompareInfos have an interesting identity. They are attached to the locale that created them,
6265
// ie: en-US would have an en-US sort. For haw-US (custom), then we serialize it as haw-US.
@@ -1181,21 +1184,67 @@ internal int GetHashCodeOfString(string source, CompareOptions options)
11811184
return GetHashCodeOfStringCore(source, options);
11821185
}
11831186

1187+
// Computes a case-insensitive hash for a short string: the letters 'a'-'z' are
// upper-cased into a stack-allocated copy, every other character is copied
// unchanged, and the copy is hashed with String.InternalMarvin32HashPtr.
//
//   source  - string to hash; must be non-null and at most 250 characters long
//             (asserted in debug builds) so the stack buffer stays small.
//   returns - the hash of the upper-cased copy, or source.GetHashCode() when the
//             string is empty.
private unsafe int GetSmallAsciiStringHash(string source)
{
    Debug.Assert(source.Length <= 250, "Input string is too long");

    // Do not allocate on the stack if string is empty
    if (source.Length == 0)
    {
        return source.GetHashCode();
    }

    char* charArr = stackalloc char[source.Length];
    for (int i = 0; i < source.Length; i++)
    {
        char c = source[i];

        // Only 'a'..'z' may be folded. An explicit range check is required here:
        // with a char operand, (c - 'a') is a signed int, so the single-comparison
        // form "(c - 'a') <= ('z' - 'a')" is also true for every character below
        // 'a' and would clear bit 0x20 on digits, spaces and punctuation.
        if (c >= 'a' && c <= 'z')
        {
            // Clearing 0x20 turns an ASCII lowercase letter into its uppercase form.
            c = (char)(c & ~0x20);
        }

        charArr[i] = c;
    }

    return String.InternalMarvin32HashPtr(charArr, source.Length);
}
1215+
11841216
public virtual int GetHashCode(string source, CompareOptions options)
11851217
{
11861218
if (source == null)
11871219
{
11881220
throw new ArgumentNullException(nameof(source));
11891221
}
11901222

1223+
if (_invariantMode)
1224+
{
1225+
// If invariant mode enabled we ignore all compare options except *IgnoreCase.
1226+
if ((options & (CompareOptions.IgnoreCase | CompareOptions.OrdinalIgnoreCase)) != 0)
1227+
{
1228+
// For small strings we allocate on the stack
1229+
if (source.Length <= 250)
1230+
return GetSmallAsciiStringHash(source);
1231+
else
1232+
return TextInfo.Invariant.ToUpper(source).GetHashCode();
1233+
}
1234+
1235+
return source.GetHashCode();
1236+
}
1237+
11911238
if (options == CompareOptions.Ordinal)
11921239
{
11931240
return source.GetHashCode();
11941241
}
11951242

11961243
if (options == CompareOptions.OrdinalIgnoreCase)
11971244
{
1198-
return TextInfo.GetHashCodeOrdinalIgnoreCase(source);
1245+
// We use native marvin hashing to avoid hash collisions. We are passing
1246+
// IgnoreCase as GetHashCodeOfStringCore can't handle OrdinalIgnoreCase.
1247+
return Invariant.GetHashCodeOfStringCore(source, CompareOptions.IgnoreCase);
11991248
}
12001249

12011250
//

src/mscorlib/src/System/Globalization/TextInfo.cs

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,6 @@ void IDeserializationCallback.OnDeserialization(Object sender)
101101
// Internal ordinal comparison functions
102102
//
103103

104-
internal static int GetHashCodeOrdinalIgnoreCase(String s)
{
    // Ordinal-ignore-case hashing is delegated to the Invariant instance: its
    // case-insensitive hash is treated as equivalent for casing purposes (that
    // equivalence is not claimed for sorting), after which the normal hash code
    // rules apply.
    var invariantInfo = Invariant;
    return invariantInfo.GetCaseInsensitiveHashCode(s);
}
110-
111104
// Currently we don't have native functions to do this, so we do it the hard way
112105
internal static int IndexOfStringOrdinalIgnoreCase(String source, String value, int startIndex, int count)
113106
{
@@ -789,64 +782,5 @@ private static bool IsLetterCategory(UnicodeCategory uc)
789782
|| uc == UnicodeCategory.ModifierLetter
790783
|| uc == UnicodeCategory.OtherLetter);
791784
}
792-
793-
//
794-
// Get case-insensitive hash code for the specified string.
795-
//
796-
internal unsafe int GetCaseInsensitiveHashCode(String str)
{
    if (str == null)
    {
        throw new ArgumentNullException(nameof(str));
    }

    // ASCII casing is assumed to be valid for whichever instance this runs on.
    // That holds today only because these methods are invoked on Invariant; a
    // refactoring that makes this correct by construction would be preferable.

    uint hash = 5381;

    // Fast path: fold ASCII characters one at a time. The first non-ASCII
    // character abandons this loop and rehashes the whole string on the slow path.
    foreach (char current in str)
    {
        uint ch = current;
        if (ch >= 0x80)
        {
            return GetCaseInsensitiveHashCodeSlow(str);
        }

        // Clearing bit 0x20 maps an ASCII lowercase letter to its uppercase form.
        if (ch >= 'a' && ch <= 'z')
        {
            ch &= ~0x20u;
        }

        hash = ((hash << 5) + hash) ^ ch;
    }

    return (int)hash;
}
833-
834-
// Slow path of the case-insensitive hash: upper-cases the whole string with
// ToUpper and then folds every resulting character into the same
// 5381-seeded shift/add/xor hash used by the fast path.
private unsafe int GetCaseInsensitiveHashCodeSlow(String str)
{
    Debug.Assert(str != null);

    uint hash = 5381;

    foreach (char upperChar in ToUpper(str))
    {
        hash = ((hash << 5) + hash) ^ upperChar;
    }

    return (int)hash;
}
851785
}
852786
}

src/mscorlib/src/System/String.Comparison.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,9 @@ public static bool Equals(String a, String b, StringComparison comparisonType)
10141014
[MethodImplAttribute(MethodImplOptions.InternalCall)]
10151015
internal static extern int InternalMarvin32HashString(string s, int strLen, long additionalEntropy);
10161016

1017+
[MethodImplAttribute(MethodImplOptions.InternalCall)]
1018+
internal unsafe static extern int InternalMarvin32HashPtr(char* s, int strLen);
1019+
10171020
internal static bool UseRandomizedHashing()
10181021
{
10191022
return InternalUseRandomizedHashing();

src/vm/ecalllist.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ FCFuncStart(gStringFuncs)
127127
#endif // FEATURE_COMINTEROP
128128
#ifdef FEATURE_RANDOMIZED_STRING_HASHING
129129
FCFuncElement("InternalMarvin32HashString", COMString::Marvin32HashString)
130+
FCFuncElement("InternalMarvin32HashPtr", COMString::Marvin32HashPtr)
130131
QCFuncElement("InternalUseRandomizedHashing", COMString::UseRandomizedHashing)
131132
#endif // FEATURE_RANDOMIZED_STRING_HASHING
132133
FCFuncEnd()

0 commit comments

Comments
 (0)