Skip to content

Commit d1750f4

Browse files
authored
Merge pull request #388 from CanDIG/daisieh/vcf_date
DIG-2080: find analysis date in VCF files
2 parents fa621c1 + 11cf00e commit d1750f4

File tree

8 files changed

+133
-17
lines changed

8 files changed

+133
-17
lines changed

create_db.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,5 @@ psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_339.sql >>setup_out.tx
4545
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_341.sql >>setup_out.txt
4646
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_352.sql >>setup_out.txt
4747
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_374.sql >>setup_out.txt
48+
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_388.sql >>setup_out.txt
4849
echo "...done"

data/files.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ CREATE TABLE variantfile (
9696
indexed INTEGER,
9797
chr_prefix VARCHAR,
9898
reference_genome VARCHAR,
99+
analysis_date DATE,
99100
PRIMARY KEY (id)
100101
-- FOREIGN KEY(drs_object_id) REFERENCES drs_object (id)
101102
);

data/pr_388.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- Migration for PR 388: add the analysis_date column to variantfile.
-- ADD COLUMN IF NOT EXISTS makes this idempotent, so the migration is
-- safe to re-run against a database that already has the column.
DO
$$
BEGIN
    ALTER TABLE variantfile ADD COLUMN IF NOT EXISTS analysis_date DATE;
END;
$$;

htsget_server/analysis_date.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import htsget_operations
2+
import database
3+
import os
4+
import sys
5+
import hashlib
6+
import re
7+
import datetime
8+
from candigv2_logging.logging import initialize, CanDIGLogger
9+
import requests
10+
from authx.auth import create_service_token
11+
import json
12+
from server import Session
13+
from sqlalchemy import Column, Integer, String, JSON, Boolean, MetaData, Date, ForeignKey, Table, select
14+
15+
16+
logger = CanDIGLogger(__file__)
17+
18+
initialize()
19+
20+
21+
def find_analysis_dates():
    """Backfill analysis dates for variantfiles already in the database.

    Collects every stored header line containing a date-like token
    ("...ate=..."), grouped by variantfile, derives an analysis date for
    each variantfile, and writes it into the matching DRS object's
    metadata via write_analysis_date.

    Returns:
        dict with two keys:
          "date_added_to": list of {variantfile_id: analysis_date} that
              were written successfully (HTTP 200)
          "errors": list of "id: status text" strings for failed writes
    """
    service_headers = {
        "X-Service-Token": create_service_token()
    }

    # Gather all date-like header lines, grouped by variantfile id.
    headers_by_vf = {}
    with Session() as session:
        q = select(database.VariantFile.id, database.Header.text)
        q = q.join(database.VariantFile, database.Header.associated_variantfiles)
        q = q.where(database.Header.text.contains("%ate=%")).order_by(database.VariantFile.id)
        for row in session.execute(q):
            result = row._mapping
            headers_by_vf.setdefault(result["id"], []).append(result["text"])

    result = {
        "date_added_to": [],
        "errors": []
    }
    for vf, texts in headers_by_vf.items():
        analysis_date = database.get_analysis_date_from_headers(texts)
        # dateparser yields datetime objects; serialize to an ISO date
        # string for the DRS metadata. isinstance against datetime.date
        # also matches datetime.datetime (its subclass), which is what the
        # previous `"datetime" in str(type(...))` check matched by accident.
        if isinstance(analysis_date, datetime.date):
            analysis_date = analysis_date.strftime("%Y-%m-%d")
        response = write_analysis_date(vf, service_headers, analysis_date)
        if response.status_code == 200:
            result["date_added_to"].append({vf: analysis_date})
        else:
            result["errors"].append(f"{vf}: {response.status_code} {response.text}")

    return result
52+
53+
def write_analysis_date(drs_obj_id, headers, analysis_date):
    """Store analysis_date in the metadata of DRS object drs_obj_id.

    Fetches the object, sets metadata["analysis_date"], and POSTs the
    updated object back.

    Returns:
        The requests.Response of the last call made: the GET response if
        the fetch failed, otherwise the POST response.
    """
    # Use single quotes for the nested getenv call: reusing double quotes
    # inside an f-string is a SyntaxError on Python < 3.12 (PEP 701 only
    # relaxed this in 3.12). Hoisted out so both URLs share it.
    drs_url = os.getenv('DRS_URL')
    response = requests.get(url=f"{drs_url}/ga4gh/drs/v1/objects/{drs_obj_id}", headers=headers)
    if response.status_code != 200:
        return response
    obj = response.json()
    obj["metadata"]["analysis_date"] = analysis_date
    return requests.post(url=f"{drs_url}/ga4gh/drs/v1/objects", headers=headers, json=obj)
61+
62+
63+
if __name__ == "__main__":
    # Run the backfill and report the outcome as pretty-printed JSON.
    report = find_analysis_dates()
    print(json.dumps(report, indent=2))

htsget_server/database.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from sqlalchemy.orm import relationship, aliased
2-
from sqlalchemy import Column, Integer, String, JSON, Boolean, MetaData, ForeignKey, Table, select
2+
from sqlalchemy import Column, Integer, String, JSON, Boolean, MetaData, Date, ForeignKey, Table, select
33
import json
44
import os
55
import re
6-
from datetime import datetime
6+
import dateparser
7+
import dateparser.search
8+
from datetime import datetime, date
79
from random import randint
810
from time import sleep
911
from config import BUCKET_SIZE, HTSGET_URL, MAX_TRIES
@@ -117,6 +119,7 @@ class VariantFile(ObjectDBBase):
117119
indexed = Column(Integer)
118120
chr_prefix = Column(String)
119121
reference_genome = Column(String)
122+
analysis_date = Column(Date)
120123

121124
# a variantfile maps to a drs object
122125
drs_object_id = Column(String)
@@ -156,7 +159,8 @@ def __repr__(self):
156159
}
157160
for sample in self.samples:
158161
result['samples'].append(sample.sample_id)
159-
162+
if self.analysis_date is not None:
163+
result['analysis_date'] = self.analysis_date.strftime("%Y-%m-%d")
160164
return json.dumps(result)
161165

162166

@@ -328,6 +332,7 @@ def create_variantfile(obj, tries=1):
328332
new_variantfile = VariantFile()
329333
new_variantfile.indexed = 0
330334
new_variantfile.chr_prefix = ''
335+
new_variantfile.analysis_date = None
331336
new_variantfile.id = obj['id']
332337
new_variantfile.reference_genome = obj['reference_genome']
333338
headers = {
@@ -455,6 +460,11 @@ def add_header_for_variantfile(obj):
455460
headertexts = map(lambda x: x.strip(), obj['texts'])
456461
with Session() as session:
457462
new_variantfile = session.query(VariantFile).filter_by(id=obj['variantfile_id']).one_or_none()
463+
analysis_date = get_analysis_date_from_headers(headertexts)
464+
# save the analysis date
465+
new_variantfile.analysis_date = analysis_date
466+
session.add(new_variantfile)
467+
458468
for headertext in headertexts:
459469
if headertext == '' or headertext.startswith("#CHROM"):
460470
continue
@@ -466,8 +476,9 @@ def add_header_for_variantfile(obj):
466476
new_header.text = headertext
467477
new_header.associated_variantfiles.append(new_variantfile)
468478
session.add(new_header)
479+
469480
session.commit()
470-
return None
481+
return json.loads(str(new_variantfile))
471482

472483

473484
def delete_header(text):
@@ -478,6 +489,33 @@ def delete_header(text):
478489
return json.loads(str(new_object))
479490

480491

492+
def get_analysis_date_from_headers(headertexts):
    """Derive an analysis date from a list of VCF header lines.

    Scans the headers for key=value lines whose key ends in "Date"/"date"
    and tries to parse each candidate value with dateparser, preferring
    "##fileDate" entries over any other date-like header.

    Returns:
        The first successfully parsed date (a datetime from dateparser),
        or None when no candidate parses.
    """
    candidates = []
    for line in headertexts:
        # look for datelike things
        match = re.match(r"(.+[Dd]ate)=(.+)", line)
        if match is None:
            continue
        # ##fileDate is the most authoritative source, so it jumps the queue.
        if match.group(1) == "##fileDate":
            candidates.insert(0, match.group(2))
        else:
            candidates.append(match.group(2))

    # process datelike things
    logger.debug(candidates)
    for raw in candidates:
        parsed = dateparser.parse(raw, date_formats=['%Y%m%d'])
        if parsed is None:
            # Fall back to scanning for a date embedded in free text.
            found = dateparser.search.search_dates(raw)
            if found is not None:
                parsed = found[0][1]
        if parsed is not None:
            logger.debug(parsed)
            return parsed
    return None
517+
518+
481519
# for efficiency, positions are bucketed into 10 bp sets: pos_bucket_id == base pair position/10, rounded down
482520
def get_bucket_for_position(pos):
483521
return int(pos/BUCKET_SIZE) * BUCKET_SIZE

htsget_server/htsget_operations.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,9 @@ def _verify_analysis_drs_object(id_):
597597
# the AnalysisDrsObject's listed ExperimentContentsObjects should match the samples in the VCF file.
598598
if len(test) > 0:
599599
raise Exception(f"AnalysisDrsObject {id_} lists experiments {test} that are not in the linked analysis file")
600+
# variant files should have an analysis_date
601+
if database.get_analysis_date_from_headers(str(gen_obj['file'].header).split('\n')) is None:
602+
raise Exception(f"AnalysisDrsObject {id_} does not have any associated analysis date")
600603
else:
601604
# for read files, we can test whether the linked file is readable by checking for references in the header.
602605
try:

htsget_server/indexing.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@
2121

2222

2323
def index_variants(drs_obj_id, program):
24-
headers = {
24+
service_headers = {
2525
"X-Service-Token": create_service_token()
2626
}
2727

28-
gen_obj = htsget_operations.get_pysam_obj(drs_obj_id, headers=headers)
28+
gen_obj = htsget_operations.get_pysam_obj(drs_obj_id, headers=service_headers)
2929
if gen_obj is None:
3030
return {"message": f"No id {drs_obj_id} exists"}, 404
3131
if "message" in gen_obj:
@@ -35,13 +35,20 @@ def index_variants(drs_obj_id, program):
3535
return {"message": f"Read object {drs_obj_id} stats calculated"}, 200
3636

3737
logger.info(f"{drs_obj_id} starting indexing")
38-
write_index_status(drs_obj_id, f"{datetime.datetime.today()} starting indexing")
38+
write_index_status(drs_obj_id, service_headers, f"{datetime.datetime.today()} starting indexing")
3939

4040
headers = str(gen_obj['file'].header).split('\n')
4141

42-
database.add_header_for_variantfile({'texts': headers, 'variantfile_id': drs_obj_id})
42+
variantfile = database.add_header_for_variantfile({'texts': headers, 'variantfile_id': drs_obj_id})
4343
logger.info(f"{drs_obj_id} indexed {len(headers)} headers")
4444

45+
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{drs_obj_id}", headers=service_headers)
46+
if response.status_code == 200:
47+
obj = response.json()
48+
if "analysis_date" not in obj["metadata"] and "analysis_date" in variantfile:
49+
obj["metadata"]["analysis_date"] = variantfile["analysis_date"]
50+
response = requests.post(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects", headers=service_headers, json=obj)
51+
4552
samples = list(gen_obj['file'].header.samples)
4653
for sample in samples:
4754
if database.create_sample({'id': sample, 'variantfile_id': drs_obj_id}) is None:
@@ -74,16 +81,13 @@ def index_variants(drs_obj_id, program):
7481

7582
logger.info(f"{drs_obj_id} writing {len(res['bucket_counts'])} entries to db")
7683
write_pos_bucket(res, drs_obj_id)
77-
mark_as_indexed(drs_obj_id)
84+
mark_as_indexed(drs_obj_id, service_headers)
7885
logger.info(f"{drs_obj_id} indexing done")
7986

8087
return {"message": f"Indexing complete for variantfile {drs_obj_id}"}, 200
8188

8289

83-
def mark_as_indexed(drs_obj_id):
84-
headers = {
85-
"X-Service-Token": create_service_token()
86-
}
90+
def mark_as_indexed(drs_obj_id, headers):
8791
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{drs_obj_id}", headers=headers)
8892
if response.status_code == 200:
8993
obj = response.json()
@@ -166,10 +170,7 @@ def index_touch_file(file_path):
166170
logger.warning(f"indexing error! {type(e)} {str(e)}")
167171

168172

169-
def write_index_status(drs_obj_id, message):
170-
headers = {
171-
"X-Service-Token": create_service_token()
172-
}
173+
def write_index_status(drs_obj_id, headers, message):
173174
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{drs_obj_id}", headers=headers)
174175
if response.status_code == 200:
175176
obj = response.json()

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ werkzeug>=3.1.0 # not directly required, pinned by Snyk to avoid a vulnerability
1717
zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
1818
urllib3>=2.2.2 # not directly required, pinned by Snyk to avoid a vulnerability
1919
requests==2.32.4
20+
dateparser>=1.2.2

0 commit comments

Comments
 (0)