Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 142 additions & 59 deletions htscodecs/tokenise_name3.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,51 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
return *exact ? from : p3;
}

/* Character classes used to split a read name into tokens.
 *
 * The values are bit patterns so that the classes of all characters in a
 * segment can be accumulated with bitwise OR:
 *   DIGIT          | x              == x       (DIGIT is the identity)
 *   HEXDIGIT_LOWER | HEXDIGIT_UPPER == STRING  (mixed-case hex is a string)
 *   STRING         | x              == STRING  (for any non-separator x)
 * SEPARATION is not meant to be accumulated; the tokeniser stops a segment
 * when it meets a SEPARATION byte.
 */
enum char_types {
    DIGIT          = 0, // 0b0000 0000: '0'-'9'
    HEXDIGIT_LOWER = 1, // 0b0000 0001: 'a'-'f'
    HEXDIGIT_UPPER = 2, // 0b0000 0010: 'A'-'F'
    STRING         = 3, // 0b0000 0011: any other non-separator byte
    SEPARATION     = 7, // 0b0000 0111: control characters and most punctuation
};

/* Short aliases that keep the table below at 16 entries per row.
   They are #undef'd again directly after the table so that the
   single-letter names do not leak into the rest of the file. */
#define D DIGIT
#define L HEXDIGIT_LOWER
#define U HEXDIGIT_UPPER
#define S STRING
#define P SEPARATION

/* Byte -> enum char_types classification table, indexed by the byte value
   (cast to uint8_t before indexing).

   Contrary to `ispunct`, some punctuation characters are classified as
   STRING rather than SEPARATION:
     '$', '!', '?'  are normal values in strings and are not used to
                    indicate field separators;
     '+', '&'       are often used for multiple values in the same field
                    and should thus not be separated;
     '@'            is also classified as STRING.
   All bytes >= 0x80 are assumed to be part of strings.

   NOTE(review): the original comment appeared to list ',' together with
   '+' and '&', but ',' is classified as SEPARATION in this table.
   Confirm which of the two is intended. */
static const uint8_t CHAR_TO_TYPE[256] = {
    // 0x00-0x1F: control characters
    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
    // SP, !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /
    P, S, P, P, S, P, S, P, P, P, P, S, P, P, P, P,
    // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, ;, <, =, >, ?
    D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, S,
    // @, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O
    S, U, U, U, U, U, U, S, S, S, S, S, S, S, S, S,
    // P, Q, R, S, T, U, V, W, X, Y, Z, [, \, ], ^, _
    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
    // `, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o
    P, L, L, L, L, L, L, S, S, S, S, S, S, S, S, S,
    // p, q, r, s, t, u, v, w, x, y, z, {, |, }, ~, DEL
    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
    // 0x80-0xFF: assume all non-ASCII bytes are strings
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
};

#undef D
#undef L
#undef U
#undef S
#undef P

//-----------------------------------------------------------------------------
// Name encoder
Expand Down Expand Up @@ -764,121 +809,159 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
ctx->max_tok = ntok+1;
}

/* Determine data type of this segment */
if (isalpha((uint8_t)name[i])) {
int s = i+1;
// int S = i+1;

// // FIXME: try which of these is best. alnum is good sometimes.
// while (s < len && isalpha((uint8_t)name[s]))
while (s < len && (isalpha((uint8_t)name[s]) ||
ispunct((uint8_t)name[s])))
// while (s < len && name[s] != ':')
// while (s < len && !isdigit((uint8_t)name[s]) && name[s] != ':')
s++;

// if (!is_fixed) {
// while (S < len && isalnum((uint8_t)name[S]))
// S++;
// if (s < S)
// s = S;
// }
uint8_t first_char = (uint8_t)name[i];
if (CHAR_TO_TYPE[first_char] == SEPARATION) {
/* Treat separator characters as separate single-character tokens. */
goto n_char;
}

/* Determine segment length and data type */
int s = i;
uint8_t token_type = DIGIT;
while (s < len) {
uint8_t c = (uint8_t)name[s];
uint8_t char_type = CHAR_TO_TYPE[c];
if (char_type == SEPARATION) {
break;
}
token_type |= char_type;
s+=1;
}
char *token = name + i;
int token_length = s - i;
int token_starts_with_zero = first_char == '0';
int token_is_number = token_type == DIGIT;
uint32_t v = 0;
if (token_type == DIGIT) {
/* Do not encode larger numbers because of uint32_t limits */
if (token_length > 9) {
token_length = 9;
}
for (int j=0; j < token_length; j++) {
v = v * 10 + token[j] - '0';
}
}
if (token_type == HEXDIGIT_LOWER || token_type == HEXDIGIT_UPPER) {
if (token_length > 16) {
// Hexadecimal values wider than 64 bits are unlikely; this is most
// likely just a string.
token_type = STRING;
}
}
if (token_type == STRING && token_length > 1) {
// Determine if it is prefix<number> or <number>suffix
size_t digit_start = token_length;
size_t i = 0;
for (;i < token_length; i++) {
if (CHAR_TO_TYPE[(uint8_t)token[i]] == DIGIT) {
digit_start = i;
break;
}
}
for (; i < token_length; i++) {
if (CHAR_TO_TYPE[(uint8_t)token[i]] != DIGIT) {
break;
}
}
int digit_end = i;
int remainder_all_string = 1;
for (; i < token_length; i++) {
if (CHAR_TO_TYPE[(uint8_t)token[i] == DIGIT]) {
remainder_all_string = 0;
}
}
if (digit_start == 0 && remainder_all_string) {
// <number>suffix
token_is_number = 1;
token_length = digit_end - digit_start;
}
if (digit_end == token_length && digit_start != token_length) {
// prefix<number>
// Only encode the prefix as a string and come back for the number later.
token_length = digit_start;
}
}

if (!token_is_number) {

// Single byte strings are better encoded as chars.
if (s-i == 1) goto n_char;
if (token_length == 1) goto n_char;

if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) {
if (s-i == ctx->lc[pnum].last[ntok].token_int &&
memcmp(&name[i],
if (token_length == ctx->lc[pnum].last[ntok].token_int &&
memcmp(token,
&ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str],
s-i) == 0) {
token_length) == 0) {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, token_length, token);
#endif
if (encode_token_match(ctx, ntok) < 0) return -1;
} else {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], s-i, &name[i]);
token_length, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], token_length, token);
#endif
// same token/length, but mismatches
if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
}
} else {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, token_length, token);
#endif
if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
}

ctx->lc[cnum].last[ntok].token_int = s-i;
ctx->lc[cnum].last[ntok].token_int = token_length;
ctx->lc[cnum].last[ntok].token_str = i;
ctx->lc[cnum].last[ntok].token_type = N_ALPHA;

i = s-1;
} else if (name[i] == '0') digits0: {
} else if (token_starts_with_zero && token_is_number) digits0: {
// Digits starting with zero; encode length + value
uint32_t s = i;
uint32_t v = 0;
int d = 0;

while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
v = v*10 + name[s] - '0';
//putchar(name[s]);
s++;
}

// TODO: optimise choice over whether to switch from DIGITS to DELTA
// regularly vs all DIGITS, also MATCH vs DELTA 0.
if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS0) {
d = v - ctx->lc[pnum].last[ntok].token_int;
if (d == 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
if (d == 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
#endif
if (encode_token_match(ctx, ntok) < 0) return -1;
//ctx->lc[pnum].last[ntok].token_delta=0;
} else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
} else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (dig0-delta, %d / %d)\n", N_DDELTA0, ctx->lc[pnum].last[ntok].token_int, v);
#endif
//if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
//if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
//ctx->lc[pnum].last[ntok].token_delta=1;
} else {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, s-i);
fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, token_length);
#endif
if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
//ctx->lc[pnum].last[ntok].token_delta=0;
}
} else {
#ifdef ENC_DEBUG
fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, s-i);
fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, token_length);
#endif
if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
//ctx->lc[pnum].last[ntok].token_delta=0;
}

ctx->lc[cnum].last[ntok].token_str = s-i; // length
ctx->lc[cnum].last[ntok].token_str = token_length;
ctx->lc[cnum].last[ntok].token_int = v;
ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;

i = s-1;
} else if (isdigit((uint8_t)name[i])) {
i = i + token_length - 1;
} else if (token_is_number) {
// digits starting 1-9; encode value
uint32_t s = i;
uint32_t v = 0;
int d = 0;

while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
v = v*10 + name[s] - '0';
//putchar(name[s]);
s++;
}

// dataset/10/K562_cytosol_LID8465_TopHat_v2.names
// col 4 is Illumina lane - we don't want match & delta in there
// as it has multiple lanes (so not ALL match) and delta is just
Expand All @@ -892,7 +975,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
// width, sometimes with leading zeros.
if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
ctx->lc[pnum].last[ntok].token_type == N_DIGITS0 &&
ctx->lc[pnum].last[ntok].token_str == s-i)
ctx->lc[pnum].last[ntok].token_str == token_length)
goto digits0;

// TODO: optimise choice over whether to switch from DIGITS to DELTA
Expand Down Expand Up @@ -936,7 +1019,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
ctx->lc[cnum].last[ntok].token_int = v;
ctx->lc[cnum].last[ntok].token_type = N_DIGITS;

i = s-1;
i = i + token_length - 1;
} else {
n_char:
//if (!isalpha((uint8_t)name[i])) putchar(name[i]);
Expand Down