Skip to content

Commit b6ec9e0

Browse files
committed
Slightly better word index encoding
1 parent f25ff57 commit b6ec9e0

File tree

2 files changed

+32
-29
lines changed

2 files changed

+32
-29
lines changed

src/transform/TextCodec.cpp

Lines changed: 31 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ const byte TextCodec::MASK_3F = byte(0x3F);
3535
const byte TextCodec::MASK_20 = byte(0x20);
3636
const byte TextCodec::MASK_40 = byte(0x40);
3737
const byte TextCodec::MASK_80 = byte(0x80);
38+
const byte TextCodec::MASK_FLIP_CASE = byte(0x80);
3839
const int TextCodec::HASH1 = 0x7FEB352D;
3940
const int TextCodec::HASH2 = 0x846CA68B;
4041
const byte TextCodec::CR = byte(0x0D);
@@ -775,14 +776,16 @@ int TextCodec1::emitWordIndex(byte dst[], int val)
775776
{
776777
// Emit word index (varint 5 bits + 7 bits + 7 bits)
777778
if (val >= TextCodec::THRESHOLD1) {
778-
int dstIdx = 0;
779-
780-
if (val >= TextCodec::THRESHOLD2)
781-
dst[dstIdx++] = byte(0xE0 | (val >> 14));
779+
if (val >= TextCodec::THRESHOLD2) {
780+
dst[0] = byte(0xE0 | (val >> 14));
781+
dst[1] = byte(0x80 | (val >> 7));
782+
dst[2] = byte(0x7F & val);
783+
return 3;
784+
}
782785

783-
dst[dstIdx] = byte(0x80 | (val >> 7));
784-
dst[dstIdx + 1] = byte(0x7F & val);
785-
return dstIdx + 2;
786+
dst[0] = byte(0x80 | (val >> 7));
787+
dst[1] = byte(0x7F & val);
788+
return 2;
786789
}
787790

788791
dst[0] = byte(val);
@@ -1148,7 +1151,7 @@ bool TextCodec2::forward(SliceArray<byte>& input, SliceArray<byte>& output, int
11481151
}
11491152

11501153
// Case flip is encoded as 0x80
1151-
dst[dstIdx] = TextCodec::MASK_80;
1154+
dst[dstIdx] = TextCodec::MASK_FLIP_CASE;
11521155
dstIdx += (pe == pe1 ? 0 : 1);
11531156
dstIdx += emitWordIndex(&dst[dstIdx], pe->_data & TextCodec::MASK_LENGTH);
11541157
emitAnchor = delimAnchor + 1 + (pe->_data >> 24);
@@ -1283,28 +1286,25 @@ int TextCodec2::emitSymbols(const byte src[], byte dst[], const int srcEnd, cons
12831286

12841287
int TextCodec2::emitWordIndex(byte dst[], int wIdx)
12851288
{
1286-
// Increment word index because 0x80 is reserved to first symbol case flip
1289+
// 0x80 is reserved for the first symbol case flip, so shift the word index by one
12871290
wIdx++;
12881291

1289-
// Emit word index (varint 6 bits + 7 bits + 7 bits)
1290-
// first byte: 0x80 => word idx, 0x40 => more bytes
1291-
// next bytes: 0x80 => 1 more byte
12921292
if (wIdx >= TextCodec::THRESHOLD3) {
12931293
if (wIdx >= TextCodec::THRESHOLD4) {
1294-
// 6 + 7 + 7 => 2^20 = 64*128*128
1295-
dst[0] = byte(0xC0 | (wIdx >> 14));
1296-
dst[1] = byte(0x80 | (wIdx >> 7));
1297-
dst[2] = byte(wIdx & 0x7F);
1294+
// 3 byte index (1111xxxx xxxxxxxx xxxxxxxx)
1295+
dst[0] = byte(0xF0 | (wIdx >> 16));
1296+
dst[1] = byte(wIdx >> 8);
1297+
dst[2] = byte(wIdx);
12981298
return 3;
12991299
}
13001300

1301-
// 6 + 7 => 2^13 = 64*128
1302-
dst[0] = byte(0xC0 | (wIdx >> 7));
1303-
dst[1] = byte(wIdx & 0x7F);
1301+
// 2 byte index (110xxxxx xxxxxxxx)
1302+
dst[0] = byte(0xC0 | (wIdx >> 8));
1303+
dst[1] = byte(wIdx);
13041304
return 2;
13051305
}
13061306

1307-
// 6 => 64
1307+
// 1 byte index (10xxxxxx) with 0x80 excluded
13081308
dst[0] = byte(0x80 | wIdx);
13091309
return 1;
13101310
}
@@ -1422,18 +1422,20 @@ bool TextCodec2::inverse(SliceArray<byte>& input, SliceArray<byte>& output, int
14221422
cur = src[srcIdx++];
14231423
}
14241424

1425-
// Read word index (varint 6 bits + 7 bits + 7 bits)
1426-
idx = int(cur & TextCodec::MASK_3F);
1427-
1428-
if ((cur & TextCodec::MASK_40) != byte(0)) {
1429-
const int idx2 = int(src[srcIdx++]);
1425+
// Read word index
1426+
// 10xxxxxx => 1 byte
1427+
// 110xxxxx => 2 bytes
1428+
// 1111xxxx => 3 bytes
1429+
idx = int(cur) & 0x7F;
14301430

1431-
if (idx2 >= 128) {
1432-
idx = (idx << 14) | ((idx2 & 0x7F) << 7) | int(src[srcIdx]);
1433-
srcIdx++;
1431+
if (idx >= 64) {
1432+
if (idx >= 112) {
1433+
idx = ((idx & 0x0F) << 16) | (int(src[srcIdx]) << 8) | int(src[srcIdx + 1]);
1434+
srcIdx += 2;
14341435
}
14351436
else {
1436-
idx = (idx << 7) | idx2;
1437+
idx = ((idx & 0x1F) << 8) | int(src[srcIdx]);
1438+
srcIdx++;
14371439
}
14381440

14391441
// Sanity check before adjusting index

src/transform/TextCodec.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,7 @@ namespace kanzi {
152152
static const byte MASK_20;
153153
static const byte MASK_40;
154154
static const byte MASK_80;
155+
static const byte MASK_FLIP_CASE;
155156

156157
TextCodec();
157158

0 commit comments

Comments
 (0)