Skip to content

Commit e298c75

Browse files
committed
Fix countCodePoints range scan for non-zero offsets
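countCodePoints(Slice, offset, length) mixed absolute and relative bounds when chunk-scanning: the loops advanced the absolute index `offset` but compared it against the relative `length`. As a result, calls with a non-zero offset could scan the wrong byte window and return incorrect code point counts.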
1 parent a994cc0 commit e298c75

File tree

2 files changed

+17
-5
lines changed

2 files changed

+17
-5
lines changed

src/main/java/io/airlift/slice/SliceUtf8.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -120,22 +120,22 @@ public static int countCodePoints(Slice utf8, int offset, int length)
120120
return 0;
121121
}
122122

123+
int end = offset + length;
123124
int continuationBytesCount = 0;
124-
// Length rounded to 8 bytes
125-
int length8 = length & 0x7FFF_FFF8;
126-
for (; offset < length8; offset += 8) {
125+
int lastLongStart = end - 8;
126+
for (; offset <= lastLongStart; offset += 8) {
127127
// Count bytes which are NOT the start of a code point
128128
continuationBytesCount += countContinuationBytes(utf8.getLongUnchecked(offset));
129129
}
130130
// Enough bytes left for 32 bits?
131-
if (offset + 4 < length) {
131+
if (offset <= end - 4) {
132132
// Count bytes which are NOT the start of a code point
133133
continuationBytesCount += countContinuationBytes(utf8.getIntUnchecked(offset));
134134

135135
offset += 4;
136136
}
137137
// Do the rest one by one
138-
for (; offset < length; offset++) {
138+
for (; offset < end; offset++) {
139139
// Count bytes which are NOT the start of a code point
140140
continuationBytesCount += countContinuationBytes(utf8.getByteUnchecked(offset));
141141
}

src/test/java/io/airlift/slice/TestSliceUtf8.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,18 @@ public void testCodePointCount()
192192
assertThat(countCodePoints(wrappedBuffer(CONTINUATION_BYTE))).isEqualTo(0);
193193
}
194194

195+
@Test
196+
public void testCodePointCountRange()
197+
{
198+
Slice utf8 = utf8Slice("€é€😀😀😀");
199+
for (int offset = 0; offset <= utf8.length(); offset++) {
200+
for (int length = 0; length <= utf8.length() - offset; length++) {
201+
assertThat(countCodePoints(utf8, offset, length))
202+
.isEqualTo(countCodePoints(utf8.slice(offset, length)));
203+
}
204+
}
205+
}
206+
195207
private static void assertCodePointCount(String string)
196208
{
197209
assertThat(countCodePoints(utf8Slice(string))).isEqualTo(string.codePoints().count());

0 commit comments

Comments
 (0)