-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasindex.c
More file actions
153 lines (118 loc) · 3.42 KB
/
asindex.c
File metadata and controls
153 lines (118 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#include <stdio.h>
#include <zlib.h>
#include "kseq.h"
#include "mutils.h"
#include "psascan/sa_use.h"
#include "fmidx/fmidx.h"
#include "lchash/lchash.h"
KSEQ_INIT(gzFile, gzread)
char revc_mapper[256];
static inline void _seq_to_upper_case(char *seq, size_t length);
static mstring ms_from_ks(kstring_t ks) {
mstring ms = {.l = ks.l, .s = strndup(ks.s, ks.l)};
return ms;
}
static void _cstr_reverse_inplace(char *s) {
size_t l = strlen(s);
char c;
for (size_t i = 0; i < l / 2; ++i) {
c = s[i];
s[i] = s[l - i - 1];
s[l - i - 1] = c;
}
}
static int32_t _dna_rand_ch() {
static int32_t val = 0;
static int pos = -1;
if (pos < 0) {
val = (int32_t) lrand48();
pos = 0;
} else if (pos < 31) {
pos += 2;
} else {
val = (int32_t) lrand48();
pos = 0;
}
return (val >> pos) & 0x3;
}
static void _dna_replace_n_inplace(char *seq, size_t length) {
for (size_t i = 0; i < length; ++i) {
if (seq[i] == 'n' || seq[i] == 'N') {
int c = _dna_rand_ch();
seq[i] = "ACGT"[c];
}
}
}
static void _seq_to_upper_case(char *seq, size_t length) {
for (size_t i = 0; i < length; ++i) {
if (seq[i] > 0x60) seq[i] -= 0x20;
}
}
static void _dna_rev_complementary_inplace(char *seq, size_t length) {
for (size_t i = 0; i < length; ++i) {
seq[i] = revc_mapper[seq[i]];
}
_cstr_reverse_inplace(seq);
}
void create_meta(const char *prefix) {
char *mfn = cstr_concat(prefix, ".mta");
FILE *mfp = fopen(mfn, "w");
char *cfn = cstr_concat(prefix, ".cat");
FILE *cfp = fopen(cfn, "w");
uint64_t offset = 0;
gzFile fp = gzopen(prefix, "r");
kseq_t *seq = kseq_init(fp);
while (kseq_read(seq) >= 0) {
mstring seq_name = ms_from_ks(seq->name);
/// mta file contents
mstring_write(seq_name, mfp); // seq_name
fwrite(&offset, sizeof(uint64_t), 1, mfp); // offset
fwrite(&seq->seq.l, sizeof(seq->seq.l), 1, mfp); // seq_len
mstring_destroy(&seq_name);
/// cat file contents
char *seqs = strndup(seq->seq.s, seq->seq.l);
_dna_replace_n_inplace(seqs, seq->seq.l);
_seq_to_upper_case(seqs, seq->seq.l);
offset += fwrite(seqs, sizeof(char), seq->seq.l, cfp);
_dna_rev_complementary_inplace(seqs, seq->seq.l);
offset += fwrite(seqs, sizeof(char), seq->seq.l, cfp);
free(seqs);
}
kseq_destroy(seq);
gzclose(fp);
/// put a '$' at the end of cat file
char c = '$';
fwrite(&c, sizeof(char), 1, cfp);
fclose(cfp);
free(cfn);
fclose(mfp);
free(mfn);
}
void init() {
revc_mapper['a'] = revc_mapper['A'] = 'T';
revc_mapper['c'] = revc_mapper['C'] = 'G';
revc_mapper['g'] = revc_mapper['G'] = 'C';
revc_mapper['t'] = revc_mapper['T'] = 'A';
srand48(time(NULL));
}
int main(int argc, const char **argv) {
init();
create_meta(argv[1]);
char *cfn = cstr_concat(argv[1], ".cat");
dna_fmi idx;
// fmi_build(cfn, &idx, 128, 8L << 30);
printf("Start building fmindex\n");
fmi_build(cfn, &idx, 32, 8L << 30);
printf("Done building fmindex, start writing to disk\n");
fmi_write(idx, cfn);
int hash_len = 12;
printf("Start building lchash\n");
lc_hash hash = lc_build(&idx, hash_len);
char *lch_path = cstr_concat(cfn, ".lch");
printf("Start writing lchash to disk\n");
lc_write(lch_path, &hash);
lc_destroy(hash);
free(lch_path);
free(cfn);
return 0;
}