11from sqlalchemy .orm import relationship , aliased
2- from sqlalchemy import Column , Integer , String , JSON , Boolean , MetaData , ForeignKey , Table , select
2+ from sqlalchemy import Column , Integer , String , JSON , Boolean , MetaData , Date , ForeignKey , Table , select
33import json
44import os
55import re
6- from datetime import datetime
6+ import dateparser
7+ import dateparser .search
8+ from datetime import datetime , date
79from random import randint
810from time import sleep
911from config import BUCKET_SIZE , HTSGET_URL , MAX_TRIES
@@ -117,6 +119,7 @@ class VariantFile(ObjectDBBase):
117119 indexed = Column (Integer )
118120 chr_prefix = Column (String )
119121 reference_genome = Column (String )
122+ analysis_date = Column (Date )
120123
121124 # a variantfile maps to a drs object
122125 drs_object_id = Column (String )
@@ -156,7 +159,8 @@ def __repr__(self):
156159 }
157160 for sample in self .samples :
158161 result ['samples' ].append (sample .sample_id )
159-
162+ if self .analysis_date is not None :
163+ result ['analysis_date' ] = self .analysis_date .strftime ("%Y-%m-%d" )
160164 return json .dumps (result )
161165
162166
@@ -328,6 +332,7 @@ def create_variantfile(obj, tries=1):
328332 new_variantfile = VariantFile ()
329333 new_variantfile .indexed = 0
330334 new_variantfile .chr_prefix = ''
335+ new_variantfile .analysis_date = None
331336 new_variantfile .id = obj ['id' ]
332337 new_variantfile .reference_genome = obj ['reference_genome' ]
333338 headers = {
@@ -455,6 +460,11 @@ def add_header_for_variantfile(obj):
455460 headertexts = map (lambda x : x .strip (), obj ['texts' ])
456461 with Session () as session :
457462 new_variantfile = session .query (VariantFile ).filter_by (id = obj ['variantfile_id' ]).one_or_none ()
463+ analysis_date = get_analysis_date_from_headers (headertexts )
464+ # save the analysis date
465+ new_variantfile .analysis_date = analysis_date
466+ session .add (new_variantfile )
467+
458468 for headertext in headertexts :
459469 if headertext == '' or headertext .startswith ("#CHROM" ):
460470 continue
@@ -466,8 +476,9 @@ def add_header_for_variantfile(obj):
466476 new_header .text = headertext
467477 new_header .associated_variantfiles .append (new_variantfile )
468478 session .add (new_header )
479+
469480 session .commit ()
470- return None
481+ return json . loads ( str ( new_variantfile ))
471482
472483
473484def delete_header (text ):
@@ -478,6 +489,33 @@ def delete_header(text):
478489 return json .loads (str (new_object ))
479490
480491
def get_analysis_date_from_headers(headertexts):
    """Pick the most plausible analysis date out of a set of VCF header lines.

    Header lines of the form ``<something>Date=<value>`` are collected as
    candidate date strings.  A ``##fileDate`` header is treated as the most
    authoritative source, so its value is placed at the front of the
    candidate list; all other ``*Date=`` values keep their original order
    after it.  Candidates are then parsed front-to-back and the first one
    that yields a date wins.

    Returns the parsed date (as produced by ``dateparser``) or ``None``
    when no candidate can be parsed.
    """
    candidates = []
    date_header = re.compile(r"(.+[Dd]ate)=(.+)")
    for line in headertexts:
        match = date_header.match(line)
        if match is None:
            continue
        if match.group(1) == "##fileDate":
            # canonical VCF date header — try it before anything else
            candidates.insert(0, match.group(2))
        else:
            candidates.append(match.group(2))

    logger.debug(candidates)
    for candidate in candidates:
        # fast path: exact YYYYMMDD (plus dateparser's default formats)
        parsed = dateparser.parse(candidate, date_formats=['%Y%m%d'])
        if parsed is None:
            # slow path: scan the string for any embedded date-like token
            found = dateparser.search.search_dates(candidate)
            if found is not None:
                parsed = found[0][1]
        if parsed is not None:
            logger.debug(parsed)
            return parsed
    return None


# for efficiency, positions are bucketed into 10 bp sets: pos_bucket_id == base pair position/10, rounded down
def get_bucket_for_position(pos):
    """Return the start coordinate of the BUCKET_SIZE-wide bucket holding *pos*."""
    # Truncate pos to its bucket index, then scale back up to a base-pair
    # position that is a multiple of BUCKET_SIZE.
    bucket_index = int(pos / BUCKET_SIZE)
    return bucket_index * BUCKET_SIZE
0 commit comments