Skip to content

Commit 518c223

Browse files
committed
improved gc_content performance
1 parent a6cc499 commit 518c223

File tree

5 files changed

+50
-47
lines changed

5 files changed

+50
-47
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,4 @@ zlib-1.3.1.zip
9797
indexed_gzip-1.10.3.zip
9898
pyfastx.cpython-312-x86_64-linux-gnu.so
9999
src.zip
100+
pyfastx.cpython-313-darwin.so

src/fasta.c

Lines changed: 34 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ PyObject *pyfastx_fasta_new(PyTypeObject *type, PyObject *args, PyObject *kwargs
121121
PYFASTX_SQLITE_CALL(
122122
sqlite3_prepare_v2(obj->index->index_db, "SELECT * FROM seq WHERE chrom=? LIMIT 1;", -1, &obj->index->seq_stmt, NULL);
123123
sqlite3_prepare_v2(obj->index->index_db, "SELECT * FROM seq WHERE ID=? LIMIT 1;", -1, &obj->index->uid_stmt, NULL);
124-
sqlite3_prepare_v2(obj->index->index_db, "SELECT * FROM comp WHERE ID=? LIMIT 1;", -1, &obj->index->comp_stmt, NULL);
124+
sqlite3_prepare_v2(obj->index->index_db, "SELECT * FROM comp WHERE seqid=?;", -1, &obj->index->comp_stmt, NULL);
125125
);
126126
}
127127

@@ -868,6 +868,10 @@ void pyfastx_fasta_calc_composition(pyfastx_Fasta *self) {
868868
Py_ssize_t i;
869869
Py_ssize_t seqid = 0;
870870

871+
if (self->index->full_index) {
872+
return;
873+
}
874+
871875
sql = "SELECT * FROM comp LIMIT 1";
872876

873877
PYFASTX_SQLITE_CALL(
@@ -876,8 +880,10 @@ void pyfastx_fasta_calc_composition(pyfastx_Fasta *self) {
876880
sqlite3_finalize(stmt);
877881
);
878882

879-
if (ret == SQLITE_ROW)
883+
if (ret == SQLITE_ROW) {
884+
self->index->full_index = 1;
880885
return;
886+
}
881887

882888
stmt = NULL;
883889

@@ -944,18 +950,18 @@ void pyfastx_fasta_calc_composition(pyfastx_Fasta *self) {
944950
}
945951

946952
sqlite3_finalize(stmt);
953+
sqlite3_exec(self->index->index_db, "CREATE INDEX seqidx ON comp (seqid);", NULL, NULL, NULL);
947954
sqlite3_exec(self->index->index_db, "COMMIT;", NULL, NULL, NULL);
948955

949956
Py_END_ALLOW_THREADS
950957

958+
self->index->full_index = 1;
951959
ks_destroy(ks);
952960
free(line.s);
953961
}
954962

955963
PyObject *pyfastx_fasta_gc_content(pyfastx_Fasta *self, void* closure) {
956964
int ret;
957-
const char *sql;
958-
sqlite3_stmt *stmt;
959965

960966
int l;
961967
Py_ssize_t n;
@@ -965,18 +971,17 @@ PyObject *pyfastx_fasta_gc_content(pyfastx_Fasta *self, void* closure) {
965971
Py_ssize_t t = 0;
966972

967973
pyfastx_fasta_calc_composition(self);
968-
sql = "SELECT * FROM comp WHERE seqid=0";
969974

970975
PYFASTX_SQLITE_CALL(
971-
sqlite3_prepare_v2(self->index->index_db, sql, -1, &stmt, NULL);
972-
ret = sqlite3_step(stmt);
976+
sqlite3_bind_int64(self->index->comp_stmt, 1, 0);
977+
ret = sqlite3_step(self->index->comp_stmt);
973978
);
974979

975980
while (ret == SQLITE_ROW) {
976981
PYFASTX_SQLITE_CALL(
977-
l = sqlite3_column_int(stmt, 2);
978-
n = sqlite3_column_int64(stmt, 3);
979-
ret = sqlite3_step(stmt);
982+
l = sqlite3_column_int(self->index->comp_stmt, 2);
983+
n = sqlite3_column_int64(self->index->comp_stmt, 3);
984+
ret = sqlite3_step(self->index->comp_stmt);
980985
);
981986

982987
switch (l) {
@@ -1002,7 +1007,7 @@ PyObject *pyfastx_fasta_gc_content(pyfastx_Fasta *self, void* closure) {
10021007
}
10031008
}
10041009

1005-
PYFASTX_SQLITE_CALL(sqlite3_finalize(stmt));
1010+
PYFASTX_SQLITE_CALL(sqlite3_reset(self->index->comp_stmt));
10061011

10071012
if (a + c + g + t > 0) {
10081013
return Py_BuildValue("f", (float)(g+c)/(a+c+g+t)*100);
@@ -1014,27 +1019,24 @@ PyObject *pyfastx_fasta_gc_content(pyfastx_Fasta *self, void* closure) {
10141019

10151020
PyObject *pyfastx_fasta_gc_skew(pyfastx_Fasta *self, void* closure) {
10161021
int ret;
1017-
const char *sql;
1018-
sqlite3_stmt *stmt;
10191022

10201023
int l;
10211024
Py_ssize_t n;
10221025
Py_ssize_t c = 0;
10231026
Py_ssize_t g = 0;
10241027

10251028
pyfastx_fasta_calc_composition(self);
1026-
sql = "SELECT * FROM comp WHERE seqid=0";
10271029

10281030
PYFASTX_SQLITE_CALL(
1029-
sqlite3_prepare_v2(self->index->index_db, sql, -1, &stmt, NULL);
1030-
ret = sqlite3_step(stmt);
1031+
sqlite3_bind_int64(self->index->comp_stmt, 1, 0);
1032+
ret = sqlite3_step(self->index->comp_stmt);
10311033
);
10321034

10331035
while (ret == SQLITE_ROW) {
10341036
PYFASTX_SQLITE_CALL(
1035-
l = sqlite3_column_int(stmt, 2);
1036-
n = sqlite3_column_int64(stmt, 3);
1037-
ret = sqlite3_step(stmt);
1037+
l = sqlite3_column_int(self->index->comp_stmt, 2);
1038+
n = sqlite3_column_int64(self->index->comp_stmt, 3);
1039+
ret = sqlite3_step(self->index->comp_stmt);
10381040
);
10391041

10401042
switch (l) {
@@ -1050,7 +1052,7 @@ PyObject *pyfastx_fasta_gc_skew(pyfastx_Fasta *self, void* closure) {
10501052
}
10511053
}
10521054

1053-
PYFASTX_SQLITE_CALL(sqlite3_finalize(stmt));
1055+
PYFASTX_SQLITE_CALL(sqlite3_reset(self->index->comp_stmt));
10541056

10551057
if (c + g > 0) {
10561058
return Py_BuildValue("f", (float)(g-c)/(g+c));
@@ -1063,10 +1065,7 @@ PyObject *pyfastx_fasta_gc_skew(pyfastx_Fasta *self, void* closure) {
10631065
PyObject *pyfastx_fasta_composition(pyfastx_Fasta *self, void* closure) {
10641066
int l;
10651067
int ret;
1066-
const char *sql;
10671068

1068-
sqlite3_stmt *stmt;
1069-
10701069
Py_ssize_t n;
10711070

10721071
PyObject *d;
@@ -1076,19 +1075,18 @@ PyObject *pyfastx_fasta_composition(pyfastx_Fasta *self, void* closure) {
10761075
pyfastx_fasta_calc_composition(self);
10771076

10781077
//the last row stores the sum of each base
1079-
sql = "SELECT * FROM comp WHERE seqid=0";
10801078
PYFASTX_SQLITE_CALL(
1081-
sqlite3_prepare_v2(self->index->index_db, sql, -1, &stmt, NULL);
1082-
ret = sqlite3_step(stmt);
1079+
sqlite3_bind_int64(self->index->comp_stmt, 1, 0);
1080+
ret = sqlite3_step(self->index->comp_stmt);
10831081
);
10841082

10851083
d = PyDict_New();
10861084

10871085
while (ret == SQLITE_ROW) {
10881086
PYFASTX_SQLITE_CALL(
1089-
l = sqlite3_column_int(stmt, 2);
1090-
n = sqlite3_column_int64(stmt, 3);
1091-
ret = sqlite3_step(stmt);
1087+
l = sqlite3_column_int(self->index->comp_stmt, 2);
1088+
n = sqlite3_column_int64(self->index->comp_stmt, 3);
1089+
ret = sqlite3_step(self->index->comp_stmt);
10921090
);
10931091

10941092
if (n > 0 && l >= 32 && l < 127) {
@@ -1100,7 +1098,7 @@ PyObject *pyfastx_fasta_composition(pyfastx_Fasta *self, void* closure) {
11001098
}
11011099
}
11021100

1103-
PYFASTX_SQLITE_CALL(sqlite3_finalize(stmt));
1101+
PYFASTX_SQLITE_CALL(sqlite3_step(self->index->comp_stmt));
11041102

11051103
return d;
11061104
}
@@ -1114,28 +1112,24 @@ PyObject *pyfastx_fasta_guess_type(pyfastx_Fasta *self, void* closure) {
11141112

11151113
char *alphabets;
11161114
char *retval;
1117-
const char *sql;
1118-
1119-
sqlite3_stmt *stmt;
11201115

11211116
Py_ssize_t n;
11221117

11231118
pyfastx_fasta_calc_composition(self);
11241119

1125-
sql = "SELECT * FROM comp WHERE seqid=0";
11261120
PYFASTX_SQLITE_CALL(
1127-
sqlite3_prepare_v2(self->index->index_db, sql, -1, &stmt, NULL);
1128-
ret = sqlite3_step(stmt);
1121+
sqlite3_bind_int64(self->index->comp_stmt, 1, 0);
1122+
ret = sqlite3_step(self->index->comp_stmt);
11291123
);
11301124

11311125
i = 0;
11321126
alphabets = (char *)malloc(128);
11331127

11341128
while (ret == SQLITE_ROW) {
11351129
PYFASTX_SQLITE_CALL(
1136-
l = sqlite3_column_int(stmt, 2);
1137-
n = sqlite3_column_int64(stmt, 3);
1138-
ret = sqlite3_step(stmt);
1130+
l = sqlite3_column_int(self->index->comp_stmt, 2);
1131+
n = sqlite3_column_int64(self->index->comp_stmt, 3);
1132+
ret = sqlite3_step(self->index->comp_stmt);
11391133
);
11401134

11411135
if (l > 32 && l < 127 && n > 0) {
@@ -1144,7 +1138,7 @@ PyObject *pyfastx_fasta_guess_type(pyfastx_Fasta *self, void* closure) {
11441138
}
11451139

11461140
alphabets[i] = '\0';
1147-
PYFASTX_SQLITE_CALL(sqlite3_finalize(stmt));
1141+
PYFASTX_SQLITE_CALL(sqlite3_reset(self->index->comp_stmt));
11481142

11491143
if (is_subset("ACGTNacgtn", alphabets) || is_subset("abcdghkmnrstvwyABCDGHKMNRSTVWY*-", alphabets)) {
11501144
retval = "DNA";

src/index.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ pyfastx_Index* pyfastx_init_index(PyObject *obj, PyObject* file_obj, PyObject* i
3131
//full name
3232
index->full_name = full_name;
3333

34+
//full index
35+
index->full_index = 0;
36+
3437
//check input file is gzip or not
3538
index->gzip_format = is_gzip_format(file_obj);
3639

@@ -477,7 +480,7 @@ pyfastx_Sequence* pyfastx_index_new_seq(pyfastx_Index *self) {
477480

478481
//position
479482
seq->start = 1;
480-
seq->end = seq->seq_len;
483+
seq->end = 0;
481484

482485
//index
483486
seq->index = self;
@@ -516,6 +519,7 @@ PyObject *pyfastx_index_make_seq(pyfastx_Index *self, sqlite3_stmt *stmt){
516519
seq->normal = sqlite3_column_int(stmt, 7);
517520
seq->desc_len = sqlite3_column_int(stmt, 8);
518521
);
522+
seq->end = seq->seq_len;
519523

520524
return (PyObject *)seq;
521525
}

src/index.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ typedef struct {
1919
//full name
2020
int full_name;
2121

22+
//full index: 1 once the comp (composition) table is known to be populated, 0 otherwise
23+
int full_index;
24+
2225
//is gzip compressed file
2326
//0 not gzip file
2427
//1 is gzip file

src/sequence.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ PyObject *pyfastx_sequence_gc_content(pyfastx_Sequence *self, void* closure) {
574574

575575
if (ret == SQLITE_ROW && self->start == 1 && self->end == self->seq_len) {
576576
while (ret == SQLITE_ROW) {
577+
577578
PYFASTX_SQLITE_CALL(
578579
l = sqlite3_column_int(self->index->comp_stmt, 2);
579580
n = sqlite3_column_int64(self->index->comp_stmt, 3);
@@ -705,18 +706,18 @@ PyObject *pyfastx_sequence_composition(pyfastx_Sequence *self, void* closure) {
705706
PyObject *c;
706707

707708
PYFASTX_SQLITE_CALL(
708-
sqlite3_bind_int64(self->index->stmt, 1, self->id);
709-
ret = sqlite3_step(self->index->stmt);
709+
sqlite3_bind_int64(self->index->comp_stmt, 1, self->id);
710+
ret = sqlite3_step(self->index->comp_stmt);
710711
);
711712

712713
d = PyDict_New();
713714

714715
if (ret == SQLITE_ROW && self->start == 1 && self->end == self->seq_len) {
715716
while (ret == SQLITE_ROW) {
716717
PYFASTX_SQLITE_CALL(
717-
l = sqlite3_column_int(self->index->stmt, 2);
718-
n = sqlite3_column_int64(self->index->stmt, 3);
719-
ret = sqlite3_step(self->index->stmt);
718+
l = sqlite3_column_int(self->index->comp_stmt, 2);
719+
n = sqlite3_column_int64(self->index->comp_stmt, 3);
720+
ret = sqlite3_step(self->index->comp_stmt);
720721
);
721722

722723
if (n > 0 && l >= 32 && l < 127) {
@@ -747,7 +748,7 @@ PyObject *pyfastx_sequence_composition(pyfastx_Sequence *self, void* closure) {
747748
}
748749
}
749750

750-
PYFASTX_SQLITE_CALL(sqlite3_reset(self->index->stmt));
751+
PYFASTX_SQLITE_CALL(sqlite3_reset(self->index->comp_stmt));
751752
return d;
752753
}
753754

0 commit comments

Comments
 (0)