Skip to content

Commit 5c348ee

Browse files
authored
Merge pull request #374 from CanDIG/daisieh/optimize-search
DIG-2092: optimizing variant search
2 parents a11c83d + 5cb1eac commit 5c348ee

File tree

7 files changed

+45
-25
lines changed

7 files changed

+45
-25
lines changed

create_db.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_315.sql >>setup_out.tx
4444
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_339.sql >>setup_out.txt
4545
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_341.sql >>setup_out.txt
4646
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_352.sql >>setup_out.txt
47+
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_374.sql >>setup_out.txt
4748
echo "...done"

data/files.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,10 @@ CREATE TABLE variantfile (
100100
-- FOREIGN KEY(drs_object_id) REFERENCES drs_object (id)
101101
);
102102
CREATE TABLE pos_bucket (
103-
id SERIAL PRIMARY KEY,
103+
id SERIAL UNIQUE NOT NULL,
104104
pos_bucket_id INTEGER NOT NULL,
105105
contig_id VARCHAR,
106+
PRIMARY KEY (contig_id, pos_bucket_id),
106107
FOREIGN KEY(contig_id) REFERENCES contig (id)
107108
);
108109
CREATE TABLE header (

data/pr_374.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
DO
2+
$$
3+
BEGIN
4+
-- remove the old primary key on the serial id and make a composite pkey on
5+
-- contig_id + pos_bucket_id
6+
ALTER TABLE pos_bucket_variantfile_association DROP CONSTRAINT pos_bucket_variantfile_association_pos_bucket_id_fkey;
7+
ALTER TABLE pos_bucket DROP CONSTRAINT pos_bucket_pkey;
8+
ALTER TABLE pos_bucket ADD CONSTRAINT pos_bucket_unique UNIQUE (id);
9+
ALTER TABLE pos_bucket ALTER COLUMN id SET NOT NULL;
10+
ALTER TABLE pos_bucket ADD CONSTRAINT pos_bucket_pkey PRIMARY KEY (contig_id, pos_bucket_id);
11+
ALTER TABLE pos_bucket_variantfile_association ADD CONSTRAINT pos_bucket_variantfile_association_pos_bucket_id_fkey FOREIGN KEY (pos_bucket_id) REFERENCES pos_bucket (id);
12+
END;
13+
$$;

entrypoint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ if [[ -f "initial_setup" ]]; then
1111
sed -i s@\<AGGREGATE_COUNT_THRESHOLD\>@$AGGREGATE_COUNT_THRESHOLD@ config.ini
1212
sed -i s@\<POSTGRES_USERNAME\>@$POSTGRES_USERNAME@ config.ini
1313

14-
bash create_db.sh
1514
mkdir -p $INDEXING_PATH
1615
mkdir -p $SEARCH_PATH/results
1716
mkdir -p $SEARCH_PATH/to_search
1817
touch $INDEXING_SWITCH_FILE
1918
rm initial_setup
2019
fi
2120

21+
bash create_db.sh
2222
python -c "import candigv2_logging.logging
2323
candigv2_logging.logging.initialize()"
2424

htsget_server/beacon_operations.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ def full_beacon_search(search_json, headers=None):
365365

366366
try:
367367
variants_by_file = variants.find_variants_in_files(potential_hits, reference_name=actual_params['reference_name'], start=actual_params['start'], end=actual_params['end'], headers=headers)
368-
resultset = compile_beacon_resultset(variants_by_file, actual_params['reference_genome'], authed_programs)
368+
resultset = compile_beacon_resultset(variants_by_file, actual_params['reference_name'], actual_params['reference_genome'], authed_programs)
369369
except Exception as e:
370370
raise Exception(f"exception in compile_beacon_resultset for {actual_params}: {type(e)} {str(e)}")
371371
# others are for filtering after:
@@ -456,7 +456,7 @@ def full_beacon_search(search_json, headers=None):
456456
return response
457457

458458

459-
def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_programs=None):
459+
def compile_beacon_resultset(variants_by_obj, reference_name=None, reference_genome="hg38", authed_programs=None):
460460
"""
461461
Each beacon result describes a variation at a specific position:
462462
resultset = [
@@ -491,6 +491,10 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
491491
}
492492
]
493493
"""
494+
495+
# find the correct sequence_id for the chromosome:
496+
seqid = database.get_refseq_for_chromosome(reference_genome=reference_genome, contig=database.normalize_contig(reference_name))
497+
494498
resultset = {}
495499
for drs_obj in variants_by_obj.keys():
496500
# check to see if this drs_object is authorized:
@@ -507,7 +511,7 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
507511
continue
508512
for variant in variants_by_obj[drs_obj]['variants']:
509513
# parse the variants beacon-style
510-
variant['variations'] = compile_variations_from_record(ref=variant.pop('ref'), alt=variant.pop('alt'), chrom=variant.pop('chrom'), pos=variant.pop('pos'), reference_genome=reference_genome)
514+
variant['variations'] = compile_variations_from_record(ref=variant.pop('ref'), alt=variant.pop('alt'), pos=variant.pop('pos'), seqid=seqid)
511515
assign_info_to_variations(variant)
512516

513517
# the variations in each variant need to be copied out first:
@@ -594,7 +598,7 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
594598
return final_resultset
595599

596600

597-
def compile_variations_from_record(ref="", alt=[""], chrom="", pos="", reference_genome="hg38"):
601+
def compile_variations_from_record(ref="", alt=[""], pos="", seqid=None):
598602
start = int(pos)
599603
end = int(pos)
600604
variations = [
@@ -622,8 +626,6 @@ def compile_variations_from_record(ref="", alt=[""], chrom="", pos="", reference
622626
}
623627
]
624628

625-
# find the correct sequence_id for the chromosome:
626-
seqid = database.get_refseq_for_chromosome(reference_genome=reference_genome, contig=database.normalize_contig(chrom))
627629
hgvsid_base = ""
628630
if seqid is not None:
629631
variations[0]['location']['sequence_id'] = "refseq:" + seqid

htsget_server/database.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,11 @@ def __repr__(self):
163163

164164
class PositionBucket(ObjectDBBase):
165165
__tablename__ = 'pos_bucket'
166-
id = Column(Integer, primary_key=True)
167-
pos_bucket_id = Column(Integer) # each bucket contains 10 bp of positions
168-
166+
pos_bucket_id = Column(Integer, primary_key=True) # each bucket contains 10 bp of positions
169167
# a pos_bucket is part of a single contig
170-
contig_id = Column(String, ForeignKey('contig.id'))
168+
contig_id = Column(String, ForeignKey('contig.id'), primary_key=True)
169+
id = Column(Integer)
170+
171171
contig = relationship(
172172
"Contig",
173173
back_populates="pos_buckets",
@@ -180,7 +180,6 @@ class PositionBucket(ObjectDBBase):
180180
)
181181
def __repr__(self):
182182
result = {
183-
'id': self.id,
184183
'contig_id': self.contig_id,
185184
'pos_bucket_id': self.pos_bucket_id,
186185
'variantfiles': []
@@ -663,8 +662,9 @@ def search(obj, tries=1):
663662
try:
664663
with Session() as session:
665664
vfile = aliased(VariantFile)
666-
q = select(vfile.drs_object_id, vfile.reference_genome, PositionBucket.id, PositionBucket.pos_bucket_id).select_from(PositionBucket).join(vfile.associated_pos_buckets).join(vfile.associated_headers)
665+
q = select(vfile.drs_object_id, vfile.reference_genome, PositionBucket.id, PositionBucket.pos_bucket_id).select_from(PositionBucket).join(vfile.associated_pos_buckets)
667666
if 'headers' in obj:
667+
q = q.join(vfile.associated_headers)
668668
for header in obj['headers']:
669669
q = q.where(Header.text.like(f"%{header}%"))
670670
if 'region' in obj:

htsget_server/variants.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,23 +72,26 @@ def parse_vcf_file(drs_object_id, reference_name=None, start=None, end=None, hea
7272
variants_by_file['alt'] = headers.pop('ALT')
7373
if 'contig' in headers:
7474
variants_by_file['contig'] = headers.pop('contig')
75+
experiment_dict = {}
7576
for r in records:
7677
experiments = []
7778
for vcf_sample in r.samples:
7879
# samples in analysis_obj are listed as {vcf_sample: experiment_id}
7980
if "experiments" in analysis_obj and vcf_sample in analysis_obj['experiments']:
8081
experiment_id = analysis_obj['experiments'][vcf_sample]
81-
headers = {
82-
"X-Service-Token": create_service_token()
83-
}
84-
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{experiment_id}", headers=headers)
85-
experiment_obj = None
86-
if response.status_code == 200:
87-
experiment_obj = response.json()
88-
if experiment_obj is not None:
89-
experiments.append(experiment_obj["name"])
90-
else:
91-
experiments.append(experiment_id)
82+
if experiment_id not in experiment_dict:
83+
experiment_id = analysis_obj['experiments'][vcf_sample]
84+
headers = {
85+
"X-Service-Token": create_service_token()
86+
}
87+
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{experiment_id}", headers=headers)
88+
experiment_obj = None
89+
if response.status_code == 200:
90+
experiment_obj = response.json()
91+
experiment_dict[experiment_id] = experiment_obj["name"]
92+
else:
93+
experiment_dict[experiment_id] = experiment_id
94+
experiments.append(experiment_dict[experiment_id])
9295
else:
9396
experiments.append(vcf_sample)
9497
variant_record = parse_variant_record(str(r), experiments, variants_by_file['info'])

0 commit comments

Comments
 (0)