@@ -35,6 +35,7 @@ const byte TextCodec::MASK_3F = byte(0x3F);
3535const byte TextCodec::MASK_20 = byte(0x20 );
3636const byte TextCodec::MASK_40 = byte(0x40 );
3737const byte TextCodec::MASK_80 = byte(0x80 );
38+ const byte TextCodec::MASK_FLIP_CASE = byte(0x80 );
3839const int TextCodec::HASH1 = 0x7FEB352D ;
3940const int TextCodec::HASH2 = 0x846CA68B ;
4041const byte TextCodec::CR = byte(0x0D );
@@ -775,14 +776,16 @@ int TextCodec1::emitWordIndex(byte dst[], int val)
775776{
776777 // Emit word index (varint 5 bits + 7 bits + 7 bits)
777778 if (val >= TextCodec::THRESHOLD1) {
778- int dstIdx = 0 ;
779-
780- if (val >= TextCodec::THRESHOLD2)
781- dst[dstIdx++] = byte (0xE0 | (val >> 14 ));
779+ if (val >= TextCodec::THRESHOLD2) {
780+ dst[0 ] = byte (0xE0 | (val >> 14 ));
781+ dst[1 ] = byte (0x80 | (val >> 7 ));
782+ dst[2 ] = byte (0x7F & val);
783+ return 3 ;
784+ }
782785
783- dst[dstIdx ] = byte (0x80 | (val >> 7 ));
784- dst[dstIdx + 1 ] = byte (0x7F & val);
785- return dstIdx + 2 ;
786+ dst[0 ] = byte (0x80 | (val >> 7 ));
787+ dst[1 ] = byte (0x7F & val);
788+ return 2 ;
786789 }
787790
788791 dst[0 ] = byte (val);
@@ -1148,7 +1151,7 @@ bool TextCodec2::forward(SliceArray<byte>& input, SliceArray<byte>& output, int
11481151 }
11491152
11501153 // Case flip is encoded as 0x80
1151- dst[dstIdx] = TextCodec::MASK_80 ;
1154+ dst[dstIdx] = TextCodec::MASK_FLIP_CASE ;
11521155 dstIdx += (pe == pe1 ? 0 : 1 );
11531156 dstIdx += emitWordIndex (&dst[dstIdx], pe->_data & TextCodec::MASK_LENGTH);
11541157 emitAnchor = delimAnchor + 1 + (pe->_data >> 24 );
@@ -1283,28 +1286,25 @@ int TextCodec2::emitSymbols(const byte src[], byte dst[], const int srcEnd, cons
12831286
12841287int TextCodec2::emitWordIndex (byte dst[], int wIdx)
12851288{
1286- // Increment word index because 0x80 is reserved to first symbol case flip
1289+ // 0x80 is reserved to first symbol case flip
12871290 wIdx++;
12881291
1289- // Emit word index (varint 6 bits + 7 bits + 7 bits)
1290- // first byte: 0x80 => word idx, 0x40 => more bytes
1291- // next bytes: 0x80 => 1 more byte
12921292 if (wIdx >= TextCodec::THRESHOLD3) {
12931293 if (wIdx >= TextCodec::THRESHOLD4) {
1294- // 6 + 7 + 7 => 2^20 = 64*128*128
1295- dst[0 ] = byte (0xC0 | (wIdx >> 14 ));
1296- dst[1 ] = byte (0x80 | ( wIdx >> 7 ) );
1297- dst[2 ] = byte (wIdx & 0x7F );
1294+ // 3 byte index (1111xxxx xxxxxxxx xxxxxxxx)
1295+ dst[0 ] = byte (0xF0 | (wIdx >> 16 ));
1296+ dst[1 ] = byte (wIdx >> 8 );
1297+ dst[2 ] = byte (wIdx);
12981298 return 3 ;
12991299 }
13001300
1301- // 6 + 7 => 2^13 = 64*128
1302- dst[0 ] = byte (0xC0 | (wIdx >> 7 ));
1303- dst[1 ] = byte (wIdx & 0x7F );
1301+ // 2 byte index (110xxxxx xxxxxxxx)
1302+ dst[0 ] = byte (0xC0 | (wIdx >> 8 ));
1303+ dst[1 ] = byte (wIdx);
13041304 return 2 ;
13051305 }
13061306
1307- // 6 => 64
1307+ // 1 byte index (10xxxxxx) with 0x80 excluded
13081308 dst[0 ] = byte (0x80 | wIdx);
13091309 return 1 ;
13101310}
@@ -1422,18 +1422,20 @@ bool TextCodec2::inverse(SliceArray<byte>& input, SliceArray<byte>& output, int
14221422 cur = src[srcIdx++];
14231423 }
14241424
1425- // Read word index (varint 6 bits + 7 bits + 7 bits)
1426- idx = int (cur & TextCodec::MASK_3F);
1427-
1428- if ((cur & TextCodec::MASK_40) != byte ( 0 )) {
1429- const int idx2 = int (src[srcIdx++]) ;
1425+ // Read word index
1426+ // 10xxxxxx => 1 byte
1427+ // 110xxxxx => 2 bytes
1428+ // 1111xxxx => 3 bytes
1429+ idx = int (cur) & 0x7F ;
14301430
1431- if (idx2 >= 128 ) {
1432- idx = (idx << 14 ) | ((idx2 & 0x7F ) << 7 ) | int (src[srcIdx]);
1433- srcIdx++;
1431+ if (idx >= 64 ) {
1432+ if (idx >= 112 ) {
1433+ idx = ((idx & 0x0F ) << 16 ) | (int (src[srcIdx]) << 8 ) | int (src[srcIdx + 1 ]);
1434+ srcIdx += 2 ;
14341435 }
14351436 else {
1436- idx = (idx << 7 ) | idx2;
1437+ idx = ((idx & 0x1F ) << 8 ) | int (src[srcIdx]);
1438+ srcIdx++;
14371439 }
14381440
14391441 // Sanity check before adjusting index
0 commit comments