-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_classifications_db.py
More file actions
78 lines (70 loc) · 2.83 KB
/
build_classifications_db.py
File metadata and controls
78 lines (70 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import argparse
import gzip
import numpy as np
from pathlib import Path
from consts import CHROMOSOME_TO_INT_GH38, PARTIAL_SUMS_GH38, CLASSIFICATIONS_ORDER_GH38, PARTIAL_SUMS_GH37, \
CHROMOSOME_TO_INT_GH37, CLASSIFICATIONS_ORDER_GH37
GENOME_LENGTH = {
37: 6203609478,
38: 6198822410,
}
PARTIAL_SUMS = {
37: PARTIAL_SUMS_GH37,
38: PARTIAL_SUMS_GH38
}
CHROMOSOME_TO_INT = {
37: CHROMOSOME_TO_INT_GH37,
38: CHROMOSOME_TO_INT_GH38
}
CLASSIFICATIONS_ORDER = {
37: CLASSIFICATIONS_ORDER_GH37,
38: CLASSIFICATIONS_ORDER_GH38
}
NEGATIVE_STRAND = {
37: 83,
38:193
}
def init_classifications_db(input_file, reference_genome):
chromosome_int = CHROMOSOME_TO_INT[reference_genome]
partial_sums = PARTIAL_SUMS[reference_genome]
genome_length = GENOME_LENGTH[reference_genome]
classifications_order = CLASSIFICATIONS_ORDER[reference_genome]
neg_strand = NEGATIVE_STRAND[reference_genome]
db = np.zeros(genome_length, dtype = np.uintc)
with gzip.open(Path(input_file), 'rt') as f:
for row in f:
if not row.startswith('#'):
fields = row.strip().split('\t')
seg_type = fields[2]
if fields[2] != 'chromosome':
chromosome = chromosome_int[fields[0]]
start_pos = int(fields[3])
end_pos = int(fields[4])
strand = fields[6]
if strand == '+':
index = partial_sums[chromosome-1]
db[index+start_pos-1: index+end_pos] |= 2 ** classifications_order[seg_type]
elif strand == '-':
index = partial_sums[chromosome+neg_strand]
db[index+start_pos-1: index+end_pos] |= 2 ** classifications_order[seg_type]
else:
index1 = partial_sums[chromosome-1]
index2 = partial_sums[chromosome+neg_strand]
db[index1+start_pos-1: index1+end_pos] |= 2**classifications_order[seg_type]
db[index2+start_pos-1 :index2+end_pos] |= 2 ** classifications_order[seg_type]
f.close()
return db
def main():
"""
Create classifications DB in .npz file from the gff file.
The GFF path , the path where the DB will be saved and the genome_reference (37 or 38) are received as input from the user.
"""
parser = argparse.ArgumentParser()
parser.add_argument("genome_file", help="path to the genome file")
parser.add_argument("db_file", help="path to save the DB file")
parser.add_argument("genome_reference", type=int, help="37 for gh37 or 38 for gh38")
args = parser.parse_args()
db = init_classifications_db(args.genome_file, args.genome_reference)
np.savez_compressed(args.db_file, db)
if __name__ == '__main__':
main()