Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 80800eb

Browse files
committed
Improve string.{Last}IndexOf perf on Unix for Ordinal/OrdinalIgnoreCase
Our current implementation of IndexOfOrdinal for strings on Unix uses Substring to get the piece of the source string we care about; this results in an unnecessary allocation / string copy. When using OrdinalIgnoreCase, we also convert both the source and search strings to upper-case using ToUpperInvariant, resulting in more allocations. And our LastIndexOfOrdinal implementation delegates to IndexOfOrdinal repeatedly, incurring such allocations potentially multiple times. This change reimplements Ordinal searching in managed code to not use Substring, and it implements OrdinalIgnoreCase searching via new functions exposed in the native globalization shim, so as to use ICU without having to make managed/native transitions for each character. With the changes, {Last}IndexOf with Ordinal/OrdinalIgnoreCase are now allocateion-free (as you'd expect), and throughput when startIndex/count and/or OrdinalIgnoreCase are used is increased significantly, on my machine anywhere from 20% to 3x, depending on the inputs.
1 parent 3ddea17 commit 80800eb

File tree

3 files changed

+135
-29
lines changed

3 files changed

+135
-29
lines changed

src/corefx/System.Globalization.Native/collation.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,84 @@ extern "C" int32_t LastIndexOf(
125125
return result;
126126
}
127127

128+
/*
129+
Static Function:
130+
AreEqualOrdinalIgnoreCase
131+
*/
132+
static bool AreEqualOrdinalIgnoreCase(UChar one, UChar two)
133+
{
134+
// Return whether the two characters are identical or would be identical if they were upper-cased.
135+
136+
if (one == two)
137+
{
138+
return true;
139+
}
140+
141+
if (one == 0x0131 || two == 0x0131)
142+
{
143+
// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
144+
// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
145+
// We special case it to match the Windows invariant behavior.
146+
return false;
147+
}
148+
149+
return u_toupper(one) == u_toupper(two);
150+
}
151+
152+
/*
153+
Function:
154+
IndexOfOrdinalIgnoreCase
155+
*/
156+
extern "C" int32_t
157+
IndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength)
158+
{
159+
int32_t endIndex = cwSourceLength - cwTargetLength;
160+
assert(endIndex >= 0);
161+
162+
for (int32_t i = 0; i <= endIndex; i++)
163+
{
164+
int32_t targetIdx = 0;
165+
for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) {
166+
if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) {
167+
break;
168+
}
169+
}
170+
171+
if (targetIdx == cwTargetLength) {
172+
return i;
173+
}
174+
}
175+
176+
return -1;
177+
}
178+
179+
/*
180+
Function:
181+
LastIndexOfOrdinalIgnoreCase
182+
*/
183+
extern "C" int32_t
184+
LastIndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength)
185+
{
186+
int32_t endIndex = cwSourceLength - cwTargetLength;
187+
assert(endIndex >= 0);
188+
189+
for (int32_t i = endIndex; i >= 0; i--)
190+
{
191+
int32_t targetIdx = 0;
192+
for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) {
193+
if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) {
194+
break;
195+
}
196+
}
197+
198+
if (targetIdx == cwTargetLength) {
199+
return i;
200+
}
201+
}
202+
203+
return -1;
204+
}
205+
128206
/*
129207
Return value is a "Win32 BOOL" (1 = true, 0 = false)
130208
*/

src/mscorlib/corefx/Interop/Unix/System.Globalization.Native/Interop.Collation.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ internal static partial class GlobalizationInterop
1818
[DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)]
1919
internal unsafe static extern int LastIndexOf(byte[] localeName, string target, char* pSource, int cwSourceLength, CompareOptions options);
2020

21+
[DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)]
22+
internal unsafe static extern int IndexOfOrdinalIgnoreCase(string target, int cwTargetLength, char* pSource, int cwSourceLength);
23+
24+
[DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)]
25+
internal unsafe static extern int LastIndexOfOrdinalIgnoreCase(string target, int cwTargetLength, char* pSource, int cwSourceLength);
26+
2127
[DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)]
2228
[return: MarshalAs(UnmanagedType.Bool)]
2329
internal unsafe static extern bool StartsWith(byte[] localeName, string target, string source, int cwSourceLength, CompareOptions options);

src/mscorlib/corefx/System/Globalization/CompareInfo.Unix.cs

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ internal unsafe CompareInfo(CultureInfo culture)
2020
m_sortNameAsUtf8 = System.Text.Encoding.UTF8.GetBytes(m_sortName);
2121
}
2222

23-
internal static int IndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase)
23+
internal static unsafe int IndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase)
2424
{
2525
Contract.Assert(source != null);
2626
Contract.Assert(value != null);
@@ -30,33 +30,41 @@ internal static int IndexOfOrdinal(string source, string value, int startIndex,
3030
return startIndex;
3131
}
3232

33-
// TODO (dotnet/corefx#3468): Move this into the shim so we don't have to do the ToUpper or call substring.
33+
if (count < value.Length)
34+
{
35+
return -1;
36+
}
3437

3538
if (ignoreCase)
3639
{
37-
source = source.ToUpper(CultureInfo.InvariantCulture);
38-
value = value.ToUpper(CultureInfo.InvariantCulture);
40+
fixed (char* pSource = source)
41+
{
42+
int index = Interop.GlobalizationInterop.IndexOfOrdinalIgnoreCase(value, value.Length, pSource + startIndex, count);
43+
return index != -1 ?
44+
startIndex + index :
45+
-1;
46+
}
3947
}
4048

41-
source = source.Substring(startIndex, count);
42-
43-
for (int i = 0; i + value.Length <= source.Length; i++)
49+
int endIndex = startIndex + (count - value.Length);
50+
for (int i = startIndex; i <= endIndex; i++)
4451
{
45-
for (int j = 0; j < value.Length; j++) {
46-
if (source[i + j] != value[j]) {
47-
break;
48-
}
52+
int valueIndex, sourceIndex;
53+
54+
for (valueIndex = 0, sourceIndex = i;
55+
valueIndex < value.Length && source[sourceIndex] == value[valueIndex];
56+
valueIndex++, sourceIndex++) ;
4957

50-
if (j == value.Length - 1) {
51-
return i + startIndex;
52-
}
58+
if (valueIndex == value.Length)
59+
{
60+
return i;
5361
}
5462
}
5563

5664
return -1;
5765
}
5866

59-
internal static int LastIndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase)
67+
internal static unsafe int LastIndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase)
6068
{
6169
Contract.Assert(source != null);
6270
Contract.Assert(value != null);
@@ -66,27 +74,41 @@ internal static int LastIndexOfOrdinal(string source, string value, int startInd
6674
return startIndex;
6775
}
6876

69-
// TODO (dotnet/corefx#3468): Move this into the shim so we don't have to do the ToUpper or call substring.
77+
if (count < value.Length)
78+
{
79+
return -1;
80+
}
81+
82+
// startIndex is the index into source where we start search backwards from.
83+
// leftStartIndex is the index into source of the start of the string that is
84+
// count characters away from startIndex.
85+
int leftStartIndex = startIndex - count + 1;
7086

7187
if (ignoreCase)
7288
{
73-
source = source.ToUpper(CultureInfo.InvariantCulture);
74-
value = value.ToUpper(CultureInfo.InvariantCulture);
89+
fixed (char* pSource = source)
90+
{
91+
int lastIndex = Interop.GlobalizationInterop.LastIndexOfOrdinalIgnoreCase(value, value.Length, pSource + leftStartIndex, count);
92+
return lastIndex != -1 ?
93+
leftStartIndex + lastIndex :
94+
-1;
95+
}
7596
}
7697

77-
source = source.Substring(startIndex - count + 1, count);
98+
for (int i = startIndex - value.Length + 1; i >= leftStartIndex; i--)
99+
{
100+
int valueIndex, sourceIndex;
78101

79-
int last = -1;
102+
for (valueIndex = 0, sourceIndex = i;
103+
valueIndex < value.Length && source[sourceIndex] == value[valueIndex];
104+
valueIndex++, sourceIndex++) ;
80105

81-
int cur = 0;
82-
while ((cur = IndexOfOrdinal(source, value, last + 1, source.Length - last - 1, false)) != -1)
83-
{
84-
last = cur;
106+
if (valueIndex == value.Length) {
107+
return i;
108+
}
85109
}
86110

87-
return last >= 0 ?
88-
last + startIndex - count + 1 :
89-
-1;
111+
return -1;
90112
}
91113

92114
private unsafe int GetHashCodeOfStringCore(string source, CompareOptions options)
@@ -138,9 +160,9 @@ private unsafe int IndexOfCore(string source, string target, int startIndex, int
138160

139161
fixed (char* pSource = source)
140162
{
141-
int lastIndex = Interop.GlobalizationInterop.IndexOf(m_sortNameAsUtf8, target, pSource + startIndex, count, options);
163+
int index = Interop.GlobalizationInterop.IndexOf(m_sortNameAsUtf8, target, pSource + startIndex, count, options);
142164

143-
return lastIndex != -1 ? lastIndex + startIndex : -1;
165+
return index != -1 ? index + startIndex : -1;
144166
}
145167
}
146168

0 commit comments

Comments
 (0)