Skip to content

Commit 349928d

Browse files
committed
Add arith+bzip2 support to name tokeniser
Also optimises the method used per data type. For example CHAR and ALPHA are letters/symbols and don't benefit from STRIPE mode. Conversely DUP, DIFF, DIGITS and DIGITS0 are always guaranteed to be 32-bit ints so do benefit greatly. This allows us to cut out a lot of the brute force work, offering faster encoding.

Benchmarks vs develop:

Level   Old size/time         New size/time
1       6052123  0m1.658s     4909581  0m1.567s
3       4924296  0m1.755s     4808368  0m1.690s   (default cram3.1)
5       4780927  0m2.859s     4768044  0m2.099s
7       4754297  0m4.028s     4758883  0m2.279s
9       4753731  0m4.353s     4753732  0m3.174s
11      5998241  0m2.661s     4787219  0m2.483s
13      4896975  0m2.994s     4703920  0m3.052s
15      4677274  0m4.809s     4656469  0m3.915s
17      4620982  0m7.845s     4629877  0m4.097s
19      4620851  0m9.694s     4621212  0m6.346s

It's particularly faster at the higher compression levels, and noticeably smaller at level 1 as we only try one method but it's the best for that type of data. Tested on a broad mix of read names from multiple platforms and in pos-sorted and name-sorted order.
1 parent 0fa0c9c commit 349928d

File tree

1 file changed

+128
-33
lines changed

1 file changed

+128
-33
lines changed

htscodecs/tokenise_name3.c

Lines changed: 128 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,60 +1241,155 @@ static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t
12411241
return clen+nb;
12421242
}
12431243

1244-
static int compress(uint8_t *in, uint64_t in_len, int level, int use_arith,
1244+
static int compress(uint8_t *in, uint64_t in_len, enum name_type type,
1245+
int level, int use_arith,
12451246
uint8_t *out, uint64_t *out_len) {
12461247
uint64_t best_sz = UINT64_MAX;
1247-
int best = 0;
12481248
uint64_t olen = *out_len;
1249+
int ret = -1;
12491250

1250-
//fprintf(stderr, "=== try %d ===\n", (int)in_len);
1251-
1252-
int m, rmethods[5][12] = {
1253-
{2, 0, 128}, // 1
1254-
{2, 0, 192+8}, // 3
1255-
{3, 0, 128, 193+8}, // 5
1256-
{6, 0,1, 129, 65, 193, 193+8}, // 7
1257-
{9, 0,1,128,129,64,65,192,193, 193+8}, // 9
1258-
};
1259-
1260-
// 1-9 to 0-4
1251+
// Map levels 1-9 to 0-4, for parameter lookup in R[] below
12611252
level = (level-1)/2;
12621253
if (level<0) level=0;
12631254
if (level>4) level=4;
12641255

1265-
for (m = 1; m <= rmethods[level][0]; m++) {
1256+
// rANS4x16pr and arith_dynamic parameters to explore.
1257+
// We brute force these, so fast levels test 1 setting and slow test more
1258+
int R[5][N_ALL][7] = {
1259+
{ // -1
1260+
/* TYPE */ {1, 128},
1261+
/* ALPHA */ {1, 129},
1262+
/* CHAR */ {1, 0},
1263+
/* DIGITS0 */ {1, 8},
1264+
/* DZLEN */ {1, 0},
1265+
/* DUP */ {1, 8},
1266+
/* DIFF */ {1, 8},
1267+
/* DIGITS */ {1, 8},
1268+
/* DDELTA */ {1, 0},
1269+
/* DDELTA0 */ {1, 128},
1270+
/* MATCH */ {1, 0},
1271+
/* NOP */ {1, 0},
1272+
/* END */ {1, 0}
1273+
},
1274+
1275+
{ // -3
1276+
/* TYPE */ {2, 192,0},
1277+
/* ALPHA */ {2, 129,1},
1278+
/* CHAR */ {1, 0},
1279+
/* DIGITS0 */ {2, 128+8,0}, // size%4==0
1280+
/* DZLEN */ {1, 0},
1281+
/* DUP */ {1, 192+8}, // size%4==0
1282+
/* DIFF */ {1, 128+8}, // size%4==0
1283+
/* DIGITS */ {1, 192+8}, // size%4==0
1284+
/* DDELTA */ {1, 0},
1285+
/* DDELTA0 */ {1, 128},
1286+
/* MATCH */ {1, 0},
1287+
/* NOP */ {1, 0},
1288+
/* END */ {1, 0}
1289+
},
1290+
1291+
{ // -5
1292+
/* TYPE */ {2, 192,0},
1293+
/* ALPHA */ {4, 1,128,0,129},
1294+
/* CHAR */ {1, 0},
1295+
/* DIGITS0 */ {2, 200,0},
1296+
/* DZLEN */ {1, 0},
1297+
/* DUP */ {1, 200},
1298+
/* DIFF */ {2, 192,200},
1299+
/* DIGITS */ {2, 132,201},
1300+
/* DDELTA */ {1, 0},
1301+
/* DDELTA0 */ {1, 128},
1302+
/* MATCH */ {1, 0},
1303+
/* NOP */ {1, 0},
1304+
/* END */ {1, 0}
1305+
},
1306+
1307+
{ // -7
1308+
/* TYPE */ {3, 193,0,1},
1309+
/* ALPHA */ {5, 128, 1,128,0,129},
1310+
/* CHAR */ {2, 1,0},
1311+
/* DIGITS0 */ {2, 200,0}, // or 201,0
1312+
/* DZLEN */ {1, 0},
1313+
/* DUP */ {1, 201},
1314+
/* DIFF */ {2, 192,200}, // or 192,201
1315+
/* DIGITS */ {2, 132, 201}, // +bz2 here and -9
1316+
/* DDELTA */ {1, 0},
1317+
/* DDELTA0 */ {1, 128},
1318+
/* MATCH */ {1, 0},
1319+
/* NOP */ {1, 0},
1320+
/* END */ {1, 0}
1321+
},
1322+
1323+
{ // -9
1324+
/* TYPE */ {6, 192,0,1, 65, 193,132},
1325+
/* ALPHA */ {4, 132, 1, 0,129},
1326+
/* CHAR */ {3, 1,0,192},
1327+
/* DIGITS0 */ {4, 201,0, 192,64},
1328+
/* DZLEN */ {3, 0,128,1},
1329+
/* DUP */ {1, 201},
1330+
/* DIFF */ {3, 192, 201,65},
1331+
/* DIGITS */ {6, 132, 201,1, 192,129, 193},
1332+
/* DDELTA */ {3, 1,0, 192},
1333+
/* DDELTA0 */ {3, 192,1, 0},
1334+
/* MATCH */ {1, 0},
1335+
/* NOP */ {1, 0},
1336+
/* END */ {1, 0}
1337+
},
1338+
};
1339+
// Minor tweak to level 3 DIGITS if arithmetic, to use O(201) instead.
1340+
if (use_arith) R[1][N_DIGITS][1]=201;
1341+
1342+
int *meth = R[level][type];
1343+
1344+
int last = 0, m;
1345+
uint8_t best_static[8192];
1346+
uint8_t *best_dat = best_static;
1347+
for (m = 1; m <= meth[0]; m++) {
12661348
*out_len = olen;
12671349

1268-
if (in_len % 4 != 0 && (rmethods[level][m] & 8))
1350+
if (!use_arith && (meth[m] & 4))
1351+
meth[m] &= ~4;
1352+
1353+
if (in_len % 4 != 0 && (meth[m] & 8))
12691354
continue;
12701355

1356+
last = 0;
12711357
if (use_arith) {
1272-
if (arith_encode(in, in_len, out, out_len, rmethods[level][m]) < 0)
1273-
return -1;
1358+
if (arith_encode(in, in_len, out, out_len, meth[m]) <0)
1359+
goto err;
12741360
} else {
1275-
if (rans_encode(in, in_len, out, out_len, rmethods[level][m]) < 0)
1276-
return -1;
1361+
if (rans_encode(in, in_len, out, out_len, meth[m]) < 0)
1362+
goto err;
12771363
}
12781364

12791365
if (best_sz > *out_len) {
12801366
best_sz = *out_len;
1281-
best = rmethods[level][m];
1367+
last = 1;
1368+
1369+
if (m+1 > meth[0])
1370+
// no need to memcpy if we're not going to overwrite out
1371+
break;
1372+
1373+
if (best_sz > 8192 && best_dat == best_static) {
1374+
// No need to realloc as best_sz only ever decreases
1375+
best_dat = malloc(best_sz);
1376+
if (!best_dat)
1377+
return -1;
1378+
}
1379+
memcpy(best_dat, out, best_sz);
12821380
}
12831381
}
12841382

1285-
*out_len = olen;
1286-
if (use_arith) {
1287-
if (arith_encode(in, in_len, out, out_len, best) < 0)
1288-
return -1;
1289-
} else {
1290-
if (rans_encode(in, in_len, out, out_len, best) < 0)
1291-
return -1;
1292-
}
1383+
if (!last)
1384+
memcpy(out, best_dat, best_sz);
1385+
*out_len = best_sz;
1386+
ret = 0;
12931387

1294-
// uint64_t tmp;
1295-
// fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]);
1388+
err:
1389+
if (best_dat != best_static)
1390+
free(best_dat);
12961391

1297-
return 0;
1392+
return ret;
12981393
}
12991394

13001395
static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) {
@@ -1446,8 +1541,8 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
14461541
return NULL;
14471542
}
14481543

1449-
if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, level, use_arith,
1450-
out, &out_len) < 0) {
1544+
if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, i&0xf, level,
1545+
use_arith, out, &out_len) < 0) {
14511546
free_context(ctx);
14521547
return NULL;
14531548
}

0 commit comments

Comments
 (0)