Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions create_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_315.sql >>setup_out.tx
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_339.sql >>setup_out.txt
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_341.sql >>setup_out.txt
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_352.sql >>setup_out.txt
psql --quiet -h "$db" -U $PGUSER -d genomic -a -f data/pr_374.sql >>setup_out.txt
echo "...done"
3 changes: 2 additions & 1 deletion data/files.sql
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,10 @@ CREATE TABLE variantfile (
-- FOREIGN KEY(drs_object_id) REFERENCES drs_object (id)
);
CREATE TABLE pos_bucket (
id SERIAL PRIMARY KEY,
id SERIAL UNIQUE NOT NULL,
pos_bucket_id INTEGER NOT NULL,
contig_id VARCHAR,
PRIMARY KEY (contig_id, pos_bucket_id),
FOREIGN KEY(contig_id) REFERENCES contig (id)
);
CREATE TABLE header (
Expand Down
13 changes: 13 additions & 0 deletions data/pr_374.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Migration (PR 374): switch pos_bucket's primary key from the surrogate
-- serial id to the natural composite key (contig_id, pos_bucket_id), while
-- keeping the serial id UNIQUE + NOT NULL so the existing foreign key in
-- pos_bucket_variantfile_association can continue to reference it.
-- NOTE(review): not idempotent — re-running this block fails once the
-- constraints have already been renamed/re-created.
DO
$$
BEGIN
-- remove the old primary key on the serial id and make a composite pkey on
-- contig_id + pos_bucket_id
-- Drop the dependent FK first; the old pkey cannot be dropped while a
-- foreign key still depends on it.
ALTER TABLE pos_bucket_variantfile_association DROP CONSTRAINT pos_bucket_variantfile_association_pos_bucket_id_fkey;
ALTER TABLE pos_bucket DROP CONSTRAINT pos_bucket_pkey;
-- id must remain UNIQUE and NOT NULL so it is still a valid FK target
-- (a foreign key may reference any unique constraint, not only a pkey).
ALTER TABLE pos_bucket ADD CONSTRAINT pos_bucket_unique UNIQUE (id);
ALTER TABLE pos_bucket ALTER COLUMN id SET NOT NULL;
ALTER TABLE pos_bucket ADD CONSTRAINT pos_bucket_pkey PRIMARY KEY (contig_id, pos_bucket_id);
-- Re-create the association FK, now pointing at the UNIQUE id column.
ALTER TABLE pos_bucket_variantfile_association ADD CONSTRAINT pos_bucket_variantfile_association_pos_bucket_id_fkey FOREIGN KEY (pos_bucket_id) REFERENCES pos_bucket (id);
END;
$$;
2 changes: 1 addition & 1 deletion entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ if [[ -f "initial_setup" ]]; then
sed -i s@\<AGGREGATE_COUNT_THRESHOLD\>@$AGGREGATE_COUNT_THRESHOLD@ config.ini
sed -i s@\<POSTGRES_USERNAME\>@$POSTGRES_USERNAME@ config.ini

bash create_db.sh
mkdir -p $INDEXING_PATH
mkdir -p $SEARCH_PATH/results
mkdir -p $SEARCH_PATH/to_search
touch $INDEXING_SWITCH_FILE
rm initial_setup
fi

bash create_db.sh
python -c "import candigv2_logging.logging
candigv2_logging.logging.initialize()"

Expand Down
14 changes: 8 additions & 6 deletions htsget_server/beacon_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def full_beacon_search(search_json, headers=None):

try:
variants_by_file = variants.find_variants_in_files(potential_hits, reference_name=actual_params['reference_name'], start=actual_params['start'], end=actual_params['end'], headers=headers)
resultset = compile_beacon_resultset(variants_by_file, actual_params['reference_genome'], authed_programs)
resultset = compile_beacon_resultset(variants_by_file, actual_params['reference_name'], actual_params['reference_genome'], authed_programs)
except Exception as e:
raise Exception(f"exception in compile_beacon_resultset for {actual_params}: {type(e)} {str(e)}")
# others are for filtering after:
Expand Down Expand Up @@ -456,7 +456,7 @@ def full_beacon_search(search_json, headers=None):
return response


def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_programs=None):
def compile_beacon_resultset(variants_by_obj, reference_name=None, reference_genome="hg38", authed_programs=None):
"""
Each beacon result describes a variation at a specific position:
resultset = [
Expand Down Expand Up @@ -491,6 +491,10 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
}
]
"""

# find the correct sequence_id for the chromosome:
seqid = database.get_refseq_for_chromosome(reference_genome=reference_genome, contig=database.normalize_contig(reference_name))

resultset = {}
for drs_obj in variants_by_obj.keys():
# check to see if this drs_object is authorized:
Expand All @@ -507,7 +511,7 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
continue
for variant in variants_by_obj[drs_obj]['variants']:
# parse the variants beacon-style
variant['variations'] = compile_variations_from_record(ref=variant.pop('ref'), alt=variant.pop('alt'), chrom=variant.pop('chrom'), pos=variant.pop('pos'), reference_genome=reference_genome)
variant['variations'] = compile_variations_from_record(ref=variant.pop('ref'), alt=variant.pop('alt'), pos=variant.pop('pos'), seqid=seqid)
assign_info_to_variations(variant)

# the variations in each variant need to be copied out first:
Expand Down Expand Up @@ -594,7 +598,7 @@ def compile_beacon_resultset(variants_by_obj, reference_genome="hg38", authed_pr
return final_resultset


def compile_variations_from_record(ref="", alt=[""], chrom="", pos="", reference_genome="hg38"):
def compile_variations_from_record(ref="", alt=[""], pos="", seqid=None):
start = int(pos)
end = int(pos)
variations = [
Expand Down Expand Up @@ -622,8 +626,6 @@ def compile_variations_from_record(ref="", alt=[""], chrom="", pos="", reference
}
]

# find the correct sequence_id for the chromosome:
seqid = database.get_refseq_for_chromosome(reference_genome=reference_genome, contig=database.normalize_contig(chrom))
hgvsid_base = ""
if seqid is not None:
variations[0]['location']['sequence_id'] = "refseq:" + seqid
Expand Down
12 changes: 6 additions & 6 deletions htsget_server/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,11 @@ def __repr__(self):

class PositionBucket(ObjectDBBase):
__tablename__ = 'pos_bucket'
id = Column(Integer, primary_key=True)
pos_bucket_id = Column(Integer) # each bucket contains 10 bp of positions

pos_bucket_id = Column(Integer, primary_key=True) # each bucket contains 10 bp of positions
# a pos_bucket is part of a single contig
contig_id = Column(String, ForeignKey('contig.id'))
contig_id = Column(String, ForeignKey('contig.id'), primary_key=True)
id = Column(Integer)

contig = relationship(
"Contig",
back_populates="pos_buckets",
Expand All @@ -180,7 +180,6 @@ class PositionBucket(ObjectDBBase):
)
def __repr__(self):
result = {
'id': self.id,
'contig_id': self.contig_id,
'pos_bucket_id': self.pos_bucket_id,
'variantfiles': []
Expand Down Expand Up @@ -663,8 +662,9 @@ def search(obj, tries=1):
try:
with Session() as session:
vfile = aliased(VariantFile)
q = select(vfile.drs_object_id, vfile.reference_genome, PositionBucket.id, PositionBucket.pos_bucket_id).select_from(PositionBucket).join(vfile.associated_pos_buckets).join(vfile.associated_headers)
q = select(vfile.drs_object_id, vfile.reference_genome, PositionBucket.id, PositionBucket.pos_bucket_id).select_from(PositionBucket).join(vfile.associated_pos_buckets)
if 'headers' in obj:
q = q.join(vfile.associated_headers)
for header in obj['headers']:
q = q.where(Header.text.like(f"%{header}%"))
if 'region' in obj:
Expand Down
25 changes: 14 additions & 11 deletions htsget_server/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,26 @@ def parse_vcf_file(drs_object_id, reference_name=None, start=None, end=None, hea
variants_by_file['alt'] = headers.pop('ALT')
if 'contig' in headers:
variants_by_file['contig'] = headers.pop('contig')
experiment_dict = {}
for r in records:
experiments = []
for vcf_sample in r.samples:
# samples in analysis_obj are listed as {vcf_sample: experiment_id}
if "experiments" in analysis_obj and vcf_sample in analysis_obj['experiments']:
experiment_id = analysis_obj['experiments'][vcf_sample]
headers = {
"X-Service-Token": create_service_token()
}
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{experiment_id}", headers=headers)
experiment_obj = None
if response.status_code == 200:
experiment_obj = response.json()
if experiment_obj is not None:
experiments.append(experiment_obj["name"])
else:
experiments.append(experiment_id)
if experiment_id not in experiment_dict:
experiment_id = analysis_obj['experiments'][vcf_sample]
headers = {
"X-Service-Token": create_service_token()
}
response = requests.get(url=f"{os.getenv("DRS_URL")}/ga4gh/drs/v1/objects/{experiment_id}", headers=headers)
experiment_obj = None
if response.status_code == 200:
experiment_obj = response.json()
experiment_dict[experiment_id] = experiment_obj["name"]
else:
experiment_dict[experiment_id] = experiment_id
experiments.append(experiment_dict[experiment_id])
else:
experiments.append(vcf_sample)
variant_record = parse_variant_record(str(r), experiments, variants_by_file['info'])
Expand Down