Tokenise read names with a character-class lookup table (CHAR_TO_TYPE)
instead of the <ctype.h> predicates, and split tokens where a run of
digits meets a string so the digits can be encoded numerically.

Review notes for this revision of the patch:
  * Line breaks and indentation were reconstructed; context whitespace may
    differ from the target file (try `git apply --ignore-whitespace`).
    The post-image hash on the index line is the one from the original patch.
  * Fixed the misplaced bracket in the remainder scan
    (CHAR_TO_TYPE[(uint8_t)token[i] == DIGIT]).
  * The numeric value is now computed after the token has been split, so a
    leading number followed by a string suffix no longer encodes as 0.
  * The two split cases are mutually exclusive (else if), so token_length
    can no longer be reset to 0 after the leading-number case.
  * The string branch advances by token_length instead of jumping to s-1,
    so digits split off from a string prefix are not skipped.
  * The scan index no longer shadows the loop variable `i`; the table is
    const and its single-letter helper macros are #undef'd after use.

diff --git a/htscodecs/tokenise_name3.c b/htscodecs/tokenise_name3.c
index 08e6261..b450d9a 100644
--- a/htscodecs/tokenise_name3.c
+++ b/htscodecs/tokenise_name3.c
@@ -681,6 +681,68 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
     return *exact ? from : p3;
 }
 
+/*
+ * Character classes used to tokenise read names.  The values are bit
+ * patterns: OR-ing the classes of every character in a segment yields the
+ * class of the whole segment.  Digits only stays DIGIT, digits mixed with
+ * one case of a-f / A-F yields that HEXDIGIT class, and everything else
+ * (including mixed-case hex) collapses to STRING.
+ */
+enum char_types {
+    DIGIT          = 0, // 0b0000 0000
+    HEXDIGIT_LOWER = 1, // 0b0000 0001
+    HEXDIGIT_UPPER = 2, // 0b0000 0010
+    STRING         = 3, // 0b0000 0011
+    SEPARATION     = 7, // 0b0000 0111
+};
+
+/* Single-letter shorthands, only meant for laying out the table below. */
+#define D DIGIT
+#define L HEXDIGIT_LOWER
+#define U HEXDIGIT_UPPER
+#define S STRING
+#define P SEPARATION
+
+/* Alternative classification table.
+Contrary to `ispunct` some characters are classified as the string type:
+$, !, ? are normal values in strings and not used to indicate field separators.
+'+' and '&' are often used for multiple values in the same field and should thus
+not be separated.
+NOTE(review): ',' is classified as a separator below; confirm that is intended. */
+static const uint8_t CHAR_TO_TYPE[256] = {
+// Control characters
+    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
+    P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
+// SP, !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /
+    P, S, P, P, S, P, S, P, P, P, P, S, P, P, P, P,
+// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, ;, <, =, >, ?
+    D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, S,
+// @, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
+    S, U, U, U, U, U, U, S, S, S, S, S, S, S, S, S,
+// P, Q, R, S, T, U, V, W, X, Y, Z, [, \, ], ^, _,
+    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
+// `, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
+    P, L, L, L, L, L, L, S, S, S, S, S, S, S, S, S,
+// p, q, r, s, t, u, v, w, x, y, z, {, |, }, ~, DEL
+    S, S, S, S, S, S, S, S, S, S, S, P, P, P, P, P,
+// Assume all non-ASCII characters are strings
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+    S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
+};
+
+/* Do not leak the single-letter shorthands into the rest of the file. */
+#undef D
+#undef L
+#undef U
+#undef S
+#undef P
+
 //-----------------------------------------------------------------------------
 // Name encoder
 
@@ -764,121 +826,163 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             ctx->max_tok = ntok+1;
         }
 
-        /* Determine data type of this segment */
-        if (isalpha((uint8_t)name[i])) {
-            int s = i+1;
-//          int S = i+1;
-
-//          // FIXME: try which of these is best. alnum is good sometimes.
-//          while (s < len && isalpha((uint8_t)name[s]))
-            while (s < len && (isalpha((uint8_t)name[s]) ||
-                               ispunct((uint8_t)name[s])))
-//          while (s < len && name[s] != ':')
-//          while (s < len && !isdigit((uint8_t)name[s]) && name[s] != ':')
-                s++;
-
-//          if (!is_fixed) {
-//              while (S < len && isalnum((uint8_t)name[S]))
-//                  S++;
-//              if (s < S)
-//                  s = S;
-//          }
+        uint8_t first_char = (uint8_t)name[i];
+        if (CHAR_TO_TYPE[first_char] == SEPARATION) {
+            /* Separators always become their own single-character token. */
+            goto n_char;
+        }
+
+        /* Determine segment length and data type: the segment runs up to
+         * the next separator and its type is the OR of its character types. */
+        int s = i;
+        uint8_t token_type = DIGIT;
+        while (s < len) {
+            uint8_t char_type = CHAR_TO_TYPE[(uint8_t)name[s]];
+            if (char_type == SEPARATION) {
+                break;
+            }
+            token_type |= char_type;
+            s += 1;
+        }
+        char *token = name + i;
+        int token_length = s - i;
+        int token_starts_with_zero = first_char == '0';
+        int token_is_number = token_type == DIGIT;
+        if (token_type == HEXDIGIT_LOWER || token_type == HEXDIGIT_UPPER) {
+            if (token_length > 16) {
+                // Hexadecimals of more than 64 bits are unlikely. This is
+                // most likely just a string.
+                token_type = STRING;
+            }
+        }
+        if (token_type == STRING && token_length > 1) {
+            // Look for one run of digits at the very start or the very end
+            // of the token, so that it can be encoded as a number.
+            int digit_start = token_length;
+            int k = 0;
+            for (; k < token_length; k++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[k]] == DIGIT) {
+                    digit_start = k;
+                    break;
+                }
+            }
+            for (; k < token_length; k++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[k]] != DIGIT) {
+                    break;
+                }
+            }
+            int digit_end = k;
+            int remainder_all_string = 1;
+            for (; k < token_length; k++) {
+                if (CHAR_TO_TYPE[(uint8_t)token[k]] == DIGIT) {
+                    remainder_all_string = 0;
+                    break;
+                }
+            }
+            if (digit_start == 0 && remainder_all_string) {
+                // Number followed by a string suffix: encode only the
+                // number now, the suffix is left for the following pass.
+                token_is_number = 1;
+                token_length = digit_end;
+            } else if (digit_end == token_length && digit_start != token_length) {
+                // String prefix followed by a number: encode only the
+                // prefix as a string and come back for the number later.
+                token_length = digit_start;
+            }
+        }
+
+        /* Compute the value only once the final extent of the number is
+         * known.  At most 9 digits are consumed so it fits in a uint32_t;
+         * any remaining digits are left for the following pass. */
+        uint32_t v = 0;
+        if (token_is_number) {
+            if (token_length > 9) {
+                token_length = 9;
+            }
+            for (int j = 0; j < token_length; j++) {
+                v = v * 10 + token[j] - '0';
+            }
+        }
 
+        if (!token_is_number) {
             // Single byte strings are better encoded as chars.
-            if (s-i == 1) goto n_char;
+            if (token_length == 1) goto n_char;
 
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) {
-                if (s-i == ctx->lc[pnum].last[ntok].token_int &&
-                    memcmp(&name[i],
+                if (token_length == ctx->lc[pnum].last[ntok].token_int &&
+                    memcmp(token,
                            &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str],
-                           s-i) == 0) {
+                           token_length) == 0) {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
+                    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, token_length, token);
 #endif
                     if (encode_token_match(ctx, ntok) < 0) return -1;
                 } else {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
-                            s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], s-i, &name[i]);
+                            token_length, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], token_length, token);
 #endif
                     // same token/length, but mismatches
-                    if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
+                    if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
+                fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, token_length, token);
 #endif
-                if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
+                if (encode_token_alpha(ctx, ntok, token, token_length) < 0) return -1;
             }
 
-            ctx->lc[cnum].last[ntok].token_int = s-i;
+            ctx->lc[cnum].last[ntok].token_int = token_length;
             ctx->lc[cnum].last[ntok].token_str = i;
             ctx->lc[cnum].last[ntok].token_type = N_ALPHA;
 
-            i = s-1;
-        } else if (name[i] == '0') digits0: {
+            i = i + token_length - 1;
+        } else if (token_starts_with_zero && token_is_number) digits0: {
             // Digits starting with zero; encode length + value
-            uint32_t s = i;
-            uint32_t v = 0;
             int d = 0;
-
-            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
-                v = v*10 + name[s] - '0';
-                //putchar(name[s]);
-                s++;
-            }
-
             // TODO: optimise choice over whether to switch from DIGITS to DELTA
             // regularly vs all DIGITS, also MATCH vs DELTA 0.
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS0) {
                 d = v - ctx->lc[pnum].last[ntok].token_int;
-                if (d == 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
+                if (d == 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
 #endif
                     if (encode_token_match(ctx, ntok) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=0;
-                } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
+                } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == token_length) {
 #ifdef ENC_DEBUG
                     fprintf(stderr, "Tok %d (dig0-delta, %d / %d)\n", N_DDELTA0, ctx->lc[pnum].last[ntok].token_int, v);
 #endif
-                    //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                    //if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                     if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=1;
                 } else {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, s-i);
+                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, token_length);
 #endif
-                    if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                    if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                     if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=0;
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, s-i);
+                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, token_length);
 #endif
-                if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
+                if (encode_token_int1_(ctx, ntok, N_DZLEN, token_length) < 0) return -1;
                 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
                 //ctx->lc[pnum].last[ntok].token_delta=0;
             }
 
-            ctx->lc[cnum].last[ntok].token_str = s-i; // length
+            ctx->lc[cnum].last[ntok].token_str = token_length; // length
             ctx->lc[cnum].last[ntok].token_int = v;
             ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
 
-            i = s-1;
-        } else if (isdigit((uint8_t)name[i])) {
+            i = i + token_length - 1;
+        } else if (token_is_number) {
             // digits starting 1-9; encode value
-            uint32_t s = i;
-            uint32_t v = 0;
             int d = 0;
 
-            while (s < len && isdigit((uint8_t)name[s]) && s-i < 9) {
-                v = v*10 + name[s] - '0';
-                //putchar(name[s]);
-                s++;
-            }
-
             // dataset/10/K562_cytosol_LID8465_TopHat_v2.names
             // col 4 is Illumina lane - we don't want match & delta in there
             // as it has multiple lanes (so not ALL match) and delta is just
@@ -892,7 +996,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             // width, sometimes with leading zeros.
             if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
                 ctx->lc[pnum].last[ntok].token_type == N_DIGITS0 &&
-                ctx->lc[pnum].last[ntok].token_str == s-i)
+                ctx->lc[pnum].last[ntok].token_str == token_length)
                 goto digits0;
 
             // TODO: optimise choice over whether to switch from DIGITS to DELTA
@@ -936,7 +1040,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
             ctx->lc[cnum].last[ntok].token_int = v;
             ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
 
-            i = s-1;
+            i = i + token_length - 1;
         } else {
         n_char:
             //if (!isalpha((uint8_t)name[i])) putchar(name[i]);