diff --git a/datachecks/conftest.py b/datachecks/conftest.py
deleted file mode 100755
index b583798a..00000000
--- a/datachecks/conftest.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# See the NOTICE file distributed with this work for additional information
-# regarding copyright ownership.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import pyBigWig
-from cyvcf2 import VCF
-import logging
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-
-def pytest_addoption(parser):
- """Add custom pytest command-line options.
-
- Adds options to supply paths to vcf, bigbed, bigwig, source_vcf and species.
- """
- parser.addoption("--vcf", type=str, default=None)
- parser.addoption("--bigbed", type=str, default=None)
- parser.addoption("--bigwig", type=str, default=None)
- parser.addoption("--source_vcf", type=str, default=None)
- parser.addoption("--species", type=str, default=None)
-
-
-def pytest_generate_tests(metafunc):
- """Parametrise fixtures from provided command-line options.
-
- For each known fixture name, set a single parameterised value from the CLI option.
- """
- if "vcf" in metafunc.fixturenames:
- metafunc.parametrize("vcf", [metafunc.config.getoption("vcf")])
- if "bigbed" in metafunc.fixturenames:
- metafunc.parametrize("bigbed", [metafunc.config.getoption("bigbed")])
- if "bigwig" in metafunc.fixturenames:
- metafunc.parametrize("bigwig", [metafunc.config.getoption("bigwig")])
- if "source_vcf" in metafunc.fixturenames:
- metafunc.parametrize("source_vcf", [metafunc.config.getoption("source_vcf")])
- if "species" in metafunc.fixturenames:
- metafunc.parametrize("species", [metafunc.config.getoption("species")])
-
-
-@pytest.fixture()
-def vcf_reader(vcf):
- """Provide a cyvcf2 VCF reader for the supplied VCF file path."""
- vcf_reader = VCF(vcf)
- return vcf_reader
-
-
-@pytest.fixture()
-def bb_reader(bigbed):
- """Provide a pyBigWig reader opened on a bigBed file path."""
- bb_reader = pyBigWig.open(bigbed)
- return bb_reader
-
-
-@pytest.fixture()
-def bw_reader(bigwig):
- """Provide a pyBigWig reader opened on a bigWig file path."""
- bw_reader = pyBigWig.open(bigwig)
- return bw_reader
diff --git a/datachecks/run_datachecks.py b/datachecks/run_datachecks.py
index 162aeca0..65fce3f6 100755
--- a/datachecks/run_datachecks.py
+++ b/datachecks/run_datachecks.py
@@ -43,22 +43,18 @@ def parse_args(args=None):
"""
parser = argparse.ArgumentParser()
- parser.add_argument("--dir", dest="dir", type=str, default=os.getcwd())
+ parser.add_argument("--dir", dest="dir", type=str, default=os.getcwd(), help="Directory containining pipeline output files")
parser.add_argument("--input_config", dest="input_config", type=str)
parser.add_argument(
- "-O", "--output_dir", dest="output_dir", type=str, default=os.getcwd()
+ "-O", "--output_dir", dest="output_dir", type=str, default=os.path.join(os.getcwd(), "output")
)
- parser.add_argument("-M", "--mem", dest="memory", type=str, default="2000")
- parser.add_argument("-t", "--time", dest="time", type=str, default="01:00:00")
+ parser.add_argument("-M", "--mem", dest="memory", type=str, default="6000")
+ parser.add_argument("-t", "--time", dest="time", type=str, default="05:00:00")
parser.add_argument(
"-p", "--partition", dest="partition", type=str, default="production"
)
- parser.add_argument(
- "--mail-user",
- dest="mail_user",
- type=str,
- default=getpass.getuser() + "@ebi.ac.uk",
- )
+ parser.add_argument("--mail-user", dest="mail_user", type=str)
+ parser.add_argument("--skip-xfail", dest="skip_xfail", action="store_true")
parser.add_argument(type=str, nargs="?", dest="tests", default="./")
return parser.parse_args(args)
@@ -134,15 +130,16 @@ def main(args=None):
input_config = args.input_config or None
dir = args.dir
output_dir = args.output_dir
+ tmp_dir = os.path.join(os.getcwd(), "tmp")
# create output directory if not already exist
os.makedirs(output_dir, exist_ok=True)
+ os.makedirs(tmp_dir, exist_ok=True)
species_metadata = {}
if input_config is not None:
species_metadata = get_species_metadata(input_config)
- vcf_files = []
api_outdir = os.path.join(dir, "api")
track_outdir = os.path.join(dir, "tracks")
for genome_uuid in os.listdir(api_outdir):
@@ -159,20 +156,26 @@ def main(args=None):
bigbed = os.path.join(track_outdir, genome_uuid, "variant-details.bb")
bigwig = os.path.join(track_outdir, genome_uuid, "variant-summary.bw")
- timestamp = int(datetime.datetime.now().timestamp())
- with open(f"dc_{timestamp}.sh", "w") as file:
+ skip_xfail_arg = ""
+ if args.skip_xfail:
+ skip_xfail_arg = "--skip_xfail"
+
+ script_file = os.path.join(tmp_dir, f"dc_{species}.sh")
+ with open(script_file, "w") as file:
file.write("#!/bin/bash\n\n")
file.write(f"#SBATCH --time={args.time}\n")
file.write(f"#SBATCH --mem={args.memory}\n")
file.write(f"#SBATCH --partition={args.partition}\n")
- file.write(f"#SBATCH --mail-user={args.mail_user}\n")
- file.write(f"#SBATCH --mail-type=END\n")
- file.write(f"#SBATCH --mail-type=FAIL\n")
+ if args.mail_user is not None:
+ file.write(f"#SBATCH --mail-user={args.mail_user}\n")
+ file.write("#SBATCH --mail-type=END\n")
+ file.write("#SBATCH --mail-type=FAIL\n")
file.write("\n")
+ file.write("module load bcftools\n")
file.write(
- f"pytest --source_vcf={source_vcf} --bigbed={bigbed} --bigwig={bigwig} --vcf={vcf} --species={species} {args.tests}\n"
+ f"pytest --tap --tb=short {skip_xfail_arg} --source_vcf={source_vcf} --bigbed={bigbed} --bigwig={bigwig} --vcf={vcf} --species={species} {args.tests}\n"
)
subprocess.run(
@@ -184,27 +187,10 @@ def main(args=None):
f"{output_dir}/dc_{species}.out",
"--error",
f"{output_dir}/dc_{species}.err",
- f"dc_{timestamp}.sh",
+ script_file,
]
)
- # subprocess.run([
- # "bsub",
- # f"-M{args.memory}",
- # "-q", f"{args.parition}",
- # "-J", f"dc_{species}",
- # "-oo", f"{output_dir}/dc_{species}.out",
- # "-eo", f"{output_dir}/dc_{species}.err",
- # f"pytest " + \
- # f"--source_vcf={source_vcf} " + \
- # f"--bigbed={bigbed} " + \
- # f"--bigwig={bigwig} " + \
- # f"--vcf={vcf} " + \
- # f"--species={species} " + \
- # f"{args.tests}"
- # ]
- # )
-
if __name__ == "__main__":
sys.exit(main())
diff --git a/datachecks/test_vcf.py b/datachecks/test_vcf.py
deleted file mode 100755
index 1e77d9ca..00000000
--- a/datachecks/test_vcf.py
+++ /dev/null
@@ -1,648 +0,0 @@
-# See the NOTICE file distributed with this work for additional information
-# regarding copyright ownership.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import os
-from cyvcf2 import VCF
-from cyvcf2.cyvcf2 import Variant
-from typing import Callable
-import subprocess
-import random
-from math import isclose
-import logging
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-# default: "empty_value" = True
-# default: "field_existance" = ["homo_sapiens", "homo_sapiens_37"]
-CSQ_FIELDS = {
- "Allele": {"empty_value": False, "field_existance": "all"},
- "Consequence": {"empty_value": False, "field_existance": "all"},
- "Feature": {"field_existance": "all"},
- "Feature": {"field_existance": "all"},
- "VARIANT_CLASS": {"empty_value": False, "field_existance": "all"},
- "SPDI": {"empty_value": False, "field_existance": "all"},
- "PUBMED": {"field_existance": "all"},
- "VAR_SYNONYMS": {"field_existance": "all"},
- "PHENOTYPES": {},
- "Conservation": {"field_existance": "homo_sapiens"},
- "CADD_PHRED": {},
- "AA": {},
- "SIFT": {},
- "PolyPhen": {},
- "gnomAD_exomes_AF": {},
- "gnomAD_exomes_AC": {},
- "gnomAD_exomes_AN": {},
- "gnomAD_exomes_AF_afr": {},
- "gnomAD_exomes_AC_afr": {},
- "gnomAD_exomes_AN_afr": {},
- "gnomAD_exomes_AF_amr": {},
- "gnomAD_exomes_AC_amr": {},
- "gnomAD_exomes_AN_amr": {},
- "gnomAD_exomes_AF_asj": {},
- "gnomAD_exomes_AC_asj": {},
- "gnomAD_exomes_AN_asj": {},
- "gnomAD_exomes_AF_eas": {},
- "gnomAD_exomes_AC_eas": {},
- "gnomAD_exomes_AN_eas": {},
- "gnomAD_exomes_AF_fin": {},
- "gnomAD_exomes_AC_fin": {},
- "gnomAD_exomes_AN_fin": {},
- "gnomAD_exomes_AF_nfe": {},
- "gnomAD_exomes_AC_nfe": {},
- "gnomAD_exomes_AN_nfe": {},
- "gnomAD_exomes_AF_oth": {},
- "gnomAD_exomes_AC_oth": {},
- "gnomAD_exomes_AN_oth": {},
- "gnomAD_exomes_AF_sas": {},
- "gnomAD_exomes_AC_sas": {},
- "gnomAD_exomes_AN_sas": {},
- "gnomAD_genomes_AF": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_afr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_afr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_afr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_amr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_amr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_amr": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_asj": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_asj": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_asj": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_eas": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_eas": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_eas": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_fin": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_fin": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_fin": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_nfe": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_nfe": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_nfe": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_oth": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_oth": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_oth": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AF_sas": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AC_sas": {"field_existance": "homo_sapiens"},
- "gnomAD_genomes_AN_sas": {"field_existance": "homo_sapiens"},
- "AF": {},
- "AFR_AF": {},
- "AMR_AF": {},
- "EAS_AF": {},
- "EUR_AF": {},
- "SAS_AF": {},
-}
-
-
-class TestFile:
- def test_exist(self, vcf):
- """Assert that the provided VCF path exists on disk.
-
- Args:
- vcf (str): Path to the VCF file passed via pytest options.
-
- Raises:
- AssertionError: If the file does not exist.
- """
- assert os.path.isfile(vcf)
-
-
-class TestHeader:
- def test_file_format(self, vcf_reader):
- """Assert VCF file defines a fileformat header entry."""
- assert vcf_reader.get_header_type("fileformat")
-
- def test_header_line(self, vcf_reader):
- """Assert the VCF header contains the mandatory CHROM/POS/ID/REF/ALT columns."""
- header_line = vcf_reader.raw_header.split("\n")[-2]
- assert header_line.startswith("#CHROM\tPOS\tID\tREF\tALT")
-
- def test_info_csq(self, vcf_reader, species):
- """Verify CSQ header exists and contains expected fields for the species.
-
- Args:
- vcf_reader: cyvcf2 VCF reader.
- species (str): Species production name passed to the tests.
- """
- assert vcf_reader.get_header_type("CSQ")
-
- csq_info_description = vcf_reader.get_header_type("CSQ")["Description"].strip(
- '"'
- )
- prefix = "Consequence annotations from Ensembl VEP. Format: "
- csq_list = [
- csq.strip() for csq in csq_info_description[len(prefix) :].split("|")
- ]
-
- for csq_field in CSQ_FIELDS:
- if "field_existance" in CSQ_FIELDS[csq_field]:
- field_existance = CSQ_FIELDS[csq_field]["field_existance"]
- else:
- field_existance = ["homo_sapiens", "homo_sapiens_37"]
-
- if type(field_existance) is str and (
- field_existance == "all" or field_existance == species
- ):
- assert csq_field in csq_list
- elif type(field_existance) is list and species in field_existance:
- assert csq_field in csq_list
- else:
- logger.info(f"{csq_field} exist - {csq_field in csq_list}")
-
-
-class TestDuplicate:
- def get_positioned_id(self, variant: Variant) -> str:
- """Return a positioned identifier for a variant (chrom:pos:id).
-
- Args:
- variant (Variant): cyvcf2 Variant object.
-
- Returns:
- str: Formatted positioned identifier.
- """
- id = variant.ID or "unknown"
- return variant.CHROM + ":" + str(variant.POS) + ":" + id
-
- def get_id(self, variant: Variant) -> str:
- """Return the raw variant ID.
-
- Args:
- variant (Variant): cyvcf2 Variant object.
-
- Returns:
- str|None: Variant ID as present in the VCF (or None).
- """
- return variant.ID
-
- def no_duplicated_identifier(
- self, vcf_reader: VCF, get_identifier: Callable
- ) -> bool:
- """Check that no duplicate identifiers are present in the VCF.
-
- Iterates through the VCF and ensures each identifier produced by get_identifier
- is unique.
-
- Args:
- vcf_reader (VCF): cyvcf2 VCF reader.
- get_identifier (Callable): Function that, given a Variant, returns an identifier.
-
- Returns:
- bool: True if no duplicates were found, False otherwise.
- """
- removal_status = {}
- for variant in vcf_reader:
- variant_identifier = get_identifier(variant)
- if variant_identifier in removal_status:
- return False
- removal_status[variant_identifier] = False
-
- return True
-
- def test_duplicate_positioned_id(self, vcf_reader):
- """Assert there are no duplicate positioned identifiers in the VCF."""
- assert self.no_duplicated_identifier(vcf_reader, self.get_positioned_id)
-
- def test_duplicate_id(self, vcf_reader):
- """Assert there are no duplicate variant IDs in the VCF."""
- assert self.no_duplicated_identifier(vcf_reader, self.get_id)
-
-
-class TestSrcCount:
- def get_total_variant_count_from_vcf(self, vcf: str) -> int:
- """Return the total number of variants in a VCF.
-
- Uses 'bcftools index --nrecords' and falls back to naive iteration if the command fails.
-
- Args:
- vcf (str): Path to VCF file.
-
- Returns:
- int: Number of records, or -1 on failure.
- """
- if vcf is None:
- logger.warning(f"Could not get variant count - no file provided")
- return -1
-
- process = subprocess.run(
- ["bcftools", "index", "--nrecords", vcf],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
-
- # if bcftools command fails try to naively iterate the file get count
- if process.returncode != 0:
- logger.warning(
- f"Could not get variant count from {vcf} using bcftools\n Will retry in naive iteration method"
- )
- try:
- local_vcf_reader = VCF(vcf)
-
- count = 0
- for _ in local_vcf_reader:
- count += 1
- local_vcf_reader.close()
-
- return count
- except:
- return -1
-
- return int(process.stdout.decode().strip())
-
- def get_variant_count_from_vcf_by_chr(self, vcf: str) -> dict:
- """Return per-chromosome variant counts for a VCF.
-
- Attempts to use 'bcftools index --stats' and falls back to iterating by chromosome
- if the command fails.
-
- Args:
- vcf (str): Path to VCF file.
-
- Returns:
- dict|int: Mapping {chrom: count} or -1 on failure.
- """
- if vcf is None:
- logger.warning(f"Could not get variant count - no file provided")
- return -1
-
- process = subprocess.run(
- ["bcftools", "index", "--stats", vcf],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
-
- # if bcftools command fails try to naively iterate the file get count
- if process.returncode != 0:
- logger.warning(
- f"Could not get variant count from {vcf} using bcftools\n Will retry in naive iteration method"
- )
- try:
- local_vcf_reader = VCF(vcf)
- chrs = local_vcf_reader.seqnames
-
- chrom_variant_counts = {}
- for chrom in chrs:
- count = 0
- # to be safe assuming 100 billion to be max bp in a chr
- for _ in local_vcf_reader(f"{chrom}:1-100000000000"):
- count += 1
- chrom_variant_counts[chrom] = count
- local_vcf_reader.close()
-
- return chrom_variant_counts
- except:
- return -1
-
- chrom_variant_counts = {}
- for chrom_stat in process.stdout.decode().strip().split("\n"):
- (chrom, contig, count) = chrom_stat.split("\t")
- chrom_variant_counts[chrom] = int(count)
-
- return chrom_variant_counts
-
- def test_compare_count_with_source(self, vcf, source_vcf):
- variant_count = self.get_total_variant_count_from_vcf(vcf)
- source_variant_count = self.get_total_variant_count_from_vcf(source_vcf)
-
- assert variant_count != -1
- assert source_variant_count != -1
- assert variant_count > source_variant_count * 0.90
-
- def test_compare_count_with_source_by_chr(self, vcf_reader, vcf, source_vcf):
- chrs = vcf_reader.seqnames
- variant_counts = self.get_variant_count_from_vcf_by_chr(vcf)
- source_variant_counts = self.get_variant_count_from_vcf_by_chr(source_vcf)
-
- assert variant_counts != -1
- assert source_variant_counts != -1
-
- for chr in chrs:
- # TBD: all chr will not be present in source VCF as vcf_prepper rename some of them
- # TBD: all chr will not be present in vcf file as vcf_prepper remove some chr variant
- if chr in variant_counts and chr in source_variant_counts:
- assert variant_counts[chr] > source_variant_counts[chr] * 0.95
-
-
-class TestContent:
- def test_csq_content(self, vcf_reader):
- """Sample CSQ annotations from random variants and check field presence.
-
- Selects a number of variants and ensures specified CSQ fields are present or
- allowed to be empty according to CSQ_FIELDS configuration.
- """
- NO_VARIANTS = 100
- NO_ITER = 100000
-
- csq_info_description = vcf_reader.get_header_type("CSQ")["Description"]
- csq_list = [
- csq.strip() for csq in csq_info_description.split("Format: ")[1].split("|")
- ]
-
- csq_field_idx = {}
- for csq_field in CSQ_FIELDS:
- if csq_field in csq_list:
- csq_field_idx[csq_field] = csq_list.index(csq_field)
-
- chrs = vcf_reader.seqnames
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- variants.append(variant)
- break
-
- iter += 1
-
- csq_field_cnt = {}
- for csq_field in csq_field_idx:
- csq_field_cnt[csq_field] = 0
-
- for variant in variants:
- csq = variant.INFO["CSQ"].split(",")[0]
- csq_field_vals = csq.split("|")
- for csq_field in csq_field_idx:
- if csq_field_vals[csq_field_idx[csq_field]] != "":
- csq_field_cnt[csq_field] += 1
-
- for csq_field in csq_field_idx:
- if "empty_value" in CSQ_FIELDS[csq_field]:
- canbe_empty = CSQ_FIELDS[csq_field]["empty_value"]
- else:
- canbe_empty = True
-
- if not canbe_empty:
- assert csq_field_cnt[csq_field] == NO_VARIANTS
- else:
- logger.info(
- f"{csq_field} count: {csq_field_cnt[csq_field]} expected: {NO_VARIANTS * 0.5}"
- )
-
-
-class TestSummaryStatistics:
- PER_ALLELE_FIELDS = {
- "transcript_consequence": "NTCSQ",
- "regulatory_consequence": "NRCSQ",
- "gene": "NGENE",
- "variant_phenotype": "NVPHN",
- "gene_phenotype": "NGPHN",
- }
-
- SKIP_CONSEQUENCE = [
- "downstream_gene_variant",
- "upstream_gene_variant",
- "intergenic_variant",
- "TF_binding_site_variant",
- "TFBS_ablation",
- "TFBS_amplification",
- ]
-
- def test_summary_statistics_per_variant(self, vcf_reader):
- """Validate per-variant summary fields such as NCITE against CSQ PUBMED entries."""
- NO_VARIANTS = 100
- NO_ITER = 100000
-
- csq_info_description = vcf_reader.get_header_type("CSQ")["Description"]
- csq_list = [
- csq.strip() for csq in csq_info_description.split("Format: ")[1].split("|")
- ]
-
- csq_field_idx = {}
- for csq_field in csq_list:
- csq_field_idx[csq_field] = csq_list.index(csq_field)
-
- chrs = vcf_reader.seqnames
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- citation = set()
-
- csqs = variant.INFO["CSQ"]
- for csq in csqs.split(","):
- csq_values = csq.split("|")
-
- if "PUBMED" in csq_field_idx:
- cites = csq_values[csq_field_idx["PUBMED"]]
- for cite in cites.split("&"):
- if cite != "":
- citation.add(cite)
-
- if len(citation) > 0:
- assert len(citation) == int(variant.INFO["NCITE"])
- else:
- assert "NCITE" not in variant.INFO
-
- iter += 1
-
- def test_summary_statistics_per_allele(self, vcf_reader, species):
- """Validate per-allele summary fields (NTCSQ, NRCSQ, NGENE, NGPHN, NVPHN) computed from CSQ."""
- NO_VARIANTS = 100
- NO_ITER = 100000
-
- csq_info_description = vcf_reader.get_header_type("CSQ")["Description"]
- csq_list = [
- csq.strip() for csq in csq_info_description.split("Format: ")[1].split("|")
- ]
-
- csq_field_idx = {}
- for csq_field in csq_list:
- csq_field_idx[csq_field] = csq_list.index(csq_field)
-
- chrs = vcf_reader.seqnames
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- transcript_consequence = {}
- regulatory_consequence = {}
- gene = {}
- gene_phenotype = {}
- variant_phenotype = {}
-
- csqs = variant.INFO["CSQ"]
- for csq in csqs.split(","):
- csq_values = csq.split("|")
-
- allele = csq_values[csq_field_idx["Allele"]]
- consequences = csq_values[csq_field_idx["Consequence"]]
- feature_stable_id = csq_values[csq_field_idx["Feature"]]
-
- for csq in consequences.split("&"):
- if csq not in self.SKIP_CONSEQUENCE:
- if csq.startswith("regulatory"):
- if allele not in regulatory_consequence:
- regulatory_consequence[allele] = set()
- regulatory_consequence[allele].add(
- f"{feature_stable_id}:{consequences}"
- )
- else:
- if allele not in transcript_consequence:
- transcript_consequence[allele] = set()
- transcript_consequence[allele].add(
- f"{feature_stable_id}:{consequences}"
- )
- if allele not in gene:
- gene[allele] = set()
- gene[allele].add(csq_values[csq_field_idx["Gene"]])
-
- if "PHENOTYPES" in csq_field_idx:
- phenotypes = csq_values[csq_field_idx["PHENOTYPES"]]
- for phenotype in phenotypes.split("&"):
- pheno_per_allele_fields = phenotype.split("+")
- if len(pheno_per_allele_fields) != 3:
- continue
-
- (name, source, feature) = pheno_per_allele_fields
- if feature.startswith("ENS"):
- if allele not in gene_phenotype:
- gene_phenotype[allele] = set()
- gene_phenotype[allele].add(f"{name}:{source}:{feature}")
- else:
- if allele not in variant_phenotype:
- variant_phenotype[allele] = set()
- variant_phenotype[allele].add(
- f"{name}:{source}:{feature}"
- )
-
- if len(regulatory_consequence) > 1:
- assert sorted(
- [len(val) for val in regulatory_consequence.values()]
- ) == sorted(variant.INFO["NRCSQ"])
- elif len(regulatory_consequence) == 1:
- assert [len(val) for val in regulatory_consequence.values()] == [
- variant.INFO["NRCSQ"]
- ]
- else:
- assert "NRCSQ" not in variant.INFO
-
- if len(transcript_consequence) > 1:
- assert sorted(
- [len(val) for val in transcript_consequence.values()]
- ) == sorted(variant.INFO["NTCSQ"])
- elif len(transcript_consequence) == 1:
- assert [len(val) for val in transcript_consequence.values()] == [
- variant.INFO["NTCSQ"]
- ]
- else:
- assert "NTCSQ" not in variant.INFO
-
- if len(gene) > 1:
- assert sorted([len(val) for val in gene.values()]) == sorted(
- variant.INFO["NGENE"]
- )
- elif len(gene) == 1:
- assert [len(val) for val in gene.values()] == [
- variant.INFO["NGENE"]
- ]
- else:
- assert "NGENE" not in variant.INFO
-
- if len(gene_phenotype) > 1:
- assert sorted(
- [len(val) for val in gene_phenotype.values()]
- ) == sorted(variant.INFO["NGPHN"])
- elif len(gene_phenotype) == 1:
- assert [len(val) for val in gene_phenotype.values()] == [
- variant.INFO["NGPHN"]
- ]
- else:
- assert "NGPHN" not in variant.INFO
-
- if len(variant_phenotype) > 1:
- assert sorted(
- [len(val) for val in variant_phenotype.values()]
- ) == sorted(variant.INFO["NVPHN"])
- elif len(variant_phenotype) == 1:
- assert [len(val) for val in variant_phenotype.values()] == [
- variant.INFO["NVPHN"]
- ]
- else:
- assert "NVPHN" not in variant.INFO
-
- iter += 1
-
- def test_summary_statistics_frequency(self, vcf_reader, species):
- """Validate representative allele frequency (RAF) matches frequencies from CSQ fields.
-
- This test is only applicable to human species (GRCh38 and GRCh37).
- """
- if species not in ["homo_sapiens", "homo_sapiens_37"]:
- pytest.skip("Unsupported species, skipping ...")
-
- if species == "homo_sapiens":
- freq_csq_field = "gnomAD_genomes_AF"
- elif species == "homo_sapiens_37":
- freq_csq_field = "gnomAD_exomes_AF"
-
- NO_VARIANTS = 100
- NO_ITER = 100000
-
- csq_info_description = vcf_reader.get_header_type("CSQ")["Description"]
- csq_list = [
- csq.strip() for csq in csq_info_description.split("Format: ")[1].split("|")
- ]
-
- csq_field_idx = {}
- for csq_field in csq_list:
- csq_field_idx[csq_field] = csq_list.index(csq_field)
-
- assert freq_csq_field in csq_field_idx
-
- chrs = vcf_reader.seqnames
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- frequency = {}
-
- csqs = variant.INFO["CSQ"]
- for csq in csqs.split(","):
- csq_values = csq.split("|")
-
- allele = csq_values[csq_field_idx["Allele"]]
- freq = csq_values[csq_field_idx[freq_csq_field]]
-
- if freq != "":
- frequency[allele] = float(freq)
-
- if len(frequency) > 1:
- actual = sorted(frequency.values())
- got = sorted(
- [val for val in variant.INFO["RAF"] if val is not None]
- )
- for idx, _ in enumerate(actual):
- assert isclose(actual[idx], got[idx], rel_tol=1e-5)
- elif len(frequency) == 1:
- actual = frequency[list(frequency.keys())[0]]
- if type(variant.INFO["RAF"]) is tuple:
- got = [val for val in variant.INFO["RAF"] if val is not None]
- assert len(got) == 1
- assert isclose(actual, got[0], rel_tol=1e-5)
- else:
- assert isclose(actual, variant.INFO["RAF"], rel_tol=1e-5)
- else:
- assert "RAF" not in variant.INFO
-
- iter += 1
diff --git a/datachecks/tests/conftest.py b/datachecks/tests/conftest.py
new file mode 100755
index 00000000..44d7bd52
--- /dev/null
+++ b/datachecks/tests/conftest.py
@@ -0,0 +1,131 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import pyBigWig
+import random
+from cyvcf2 import VCF
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+NO_VARIANTS = 10000
+
+def pytest_addoption(parser):
+ """Add custom pytest command-line options.
+
+    Adds options to supply paths to vcf, bigbed, bigwig, source_vcf and species, plus the skip_xfail flag and no_rvariants limit.
+ """
+ parser.addoption("--vcf", type=str, default=None)
+ parser.addoption("--bigbed", type=str, default=None)
+ parser.addoption("--bigwig", type=str, default=None)
+ parser.addoption("--source_vcf", type=str, default=None)
+ parser.addoption("--species", type=str, default=None)
+ parser.addoption("--skip_xfail", action="store_true", default=None)
+ parser.addoption("--no_rvariants", type=int, default=1000)
+
+
+def pytest_generate_tests(metafunc):
+ """Parametrise fixtures from provided command-line options.
+
+ For each known fixture name, set a single parameterised value from the CLI option.
+ """
+ if "vcf" in metafunc.fixturenames:
+ metafunc.parametrize("vcf", [metafunc.config.getoption("vcf")], ids=['vcf'], scope="session")
+ if "bigbed" in metafunc.fixturenames:
+ metafunc.parametrize("bigbed", [metafunc.config.getoption("bigbed")], ids=['bb'], scope="session")
+ if "bigwig" in metafunc.fixturenames:
+ metafunc.parametrize("bigwig", [metafunc.config.getoption("bigwig")], ids=['bw'], scope="session")
+ if "source_vcf" in metafunc.fixturenames:
+ metafunc.parametrize("source_vcf", [metafunc.config.getoption("source_vcf")], ids=['source_vcf'], scope="session")
+ if "species" in metafunc.fixturenames:
+ metafunc.parametrize("species", [metafunc.config.getoption("species")], ids=[metafunc.config.getoption("species")], scope="session")
+ if "skip_xfail" in metafunc.fixturenames:
+ metafunc.parametrize("skip_xfail", [metafunc.config.getoption("skip_xfail")], ids=[f"skip_xfail={metafunc.config.getoption('skip_xfail')}"], scope="session")
+ if "no_rvariants" in metafunc.fixturenames:
+ metafunc.parametrize("no_rvariants", [metafunc.config.getoption("no_rvariants")], ids=[f"no_variants={metafunc.config.getoption('no_rvariants')}"], scope="session")
+
+
+@pytest.fixture(scope="session")
+def vcf_reader(vcf):
+ """Provide a cyvcf2 VCF reader for the supplied VCF file path."""
+ vcf_reader = VCF(vcf)
+ return vcf_reader
+
+
+@pytest.fixture()
+def bb_reader(bigbed):
+ """Provide a pyBigWig reader opened on a bigBed file path."""
+ bb_reader = pyBigWig.open(bigbed)
+ return bb_reader
+
+
+@pytest.fixture()
+def bw_reader(bigwig):
+ """Provide a pyBigWig reader opened on a bigWig file path."""
+ bw_reader = pyBigWig.open(bigwig)
+ return bw_reader
+
+
+@pytest.fixture(scope="session")
+def variant_list(vcf_reader, no_rvariants):
+ test = {}
+ csq_info_description = vcf_reader.get_header_type("CSQ")["Description"]
+ csq_list = [
+ csq.strip() for csq in csq_info_description.split("Format: ")[1].split("|")
+ ]
+ csq_field_idx = {}
+ for csq_field in csq_list:
+ csq_field_idx[csq_field] = csq_list.index(csq_field)
+ summary_stats_fields = ['NVPHN', 'NGPHN', 'NTCSQ', 'NRCSQ', 'NGENE', 'NCITE', 'RAF']
+
+ chrs = vcf_reader.seqnames
+ variant_list = {}
+ total_no_variants = 0
+ iteration = 0
+ while total_no_variants < NO_VARIANTS:
+ chr = random.choice(chrs)
+ start = random.choice(range(1000, 100000000))
+
+ no_variants = 0
+ for variant in vcf_reader(f"{chr}:{start}"):
+ variant_id = variant.ID
+ variant_list[variant_id] = {
+ 'chrom': variant.CHROM,
+ 'pos': variant.POS,
+ 'ref': variant.REF,
+ 'alts': variant.ALT,
+ 'csqs': []
+ }
+
+ for csq in variant.INFO["CSQ"].split(","):
+ csq_field_vals = csq.split("|")
+ csq_hash = {csq_list[idx]:csq_val for idx, csq_val in enumerate(csq_field_vals)}
+ variant_list[variant_id]['csqs'].append(csq_hash)
+
+ for ss_field in summary_stats_fields:
+ variant_list[variant_id][ss_field] = variant.INFO.get(ss_field, None)
+
+ total_no_variants += 1
+ no_variants += 1
+ if no_variants > 10:
+ break
+
+        # break the infinite loop - we may sometimes be unable to gather the required number of variants
+ iteration += 1
+ if iteration > no_rvariants:
+ break
+
+ return variant_list
\ No newline at end of file
diff --git a/datachecks/test_bigbed.py b/datachecks/tests/test_bigbed.py
similarity index 58%
rename from datachecks/test_bigbed.py
rename to datachecks/tests/test_bigbed.py
index a3d8b0d9..2582eb66 100755
--- a/datachecks/test_bigbed.py
+++ b/datachecks/tests/test_bigbed.py
@@ -12,11 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import pytest
import os
import subprocess
-import random
+import logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
class TestFile:
def test_exist(self, bigbed):
@@ -38,8 +39,8 @@ def get_total_variant_count_from_vcf(self, vcf: str) -> int:
Returns:
int: Number of records or -1 on failure.
"""
- if vcf is None:
- logger.warning(f"Could not get variant count - no file provided")
+ if not os.path.isfile(vcf):
+ logger.warning(f"Could not get variant count from VCF - file not found")
return -1
process = subprocess.run(
@@ -49,66 +50,66 @@ def get_total_variant_count_from_vcf(self, vcf: str) -> int:
)
if process.returncode != 0:
- logger.warning(f"Could not get variant count from vcf - {vcf}")
+ logger.warning(f"Could not get variant count from VCF - {process.stderr.decode()}")
return -1
return int(process.stdout.decode().strip())
- def get_total_variant_count_from_bb(self, bb_reader) -> int:
+ def get_total_variant_count_from_bb(self, bigbed) -> int:
"""Count entries across all chromosomes in a bigBed reader.
Args:
- bb_reader: bigBed reader instance.
+ bigbed (str): Path to bigBed file.
Returns:
int: Total number of entries.
"""
- variant_counts = 0
- for chr in bb_reader.chroms():
- variant_counts += len(bb_reader.entries(chr, 0, bb_reader.chroms(chr)))
- return variant_counts
+ if not os.path.isfile(bigbed):
+ logger.warning(f"Could not get variant count in bigBed - file not found")
+ return -1
+
+ process = subprocess.run(
+ ["bigBedInfo", bigbed],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
- def test_compare_count_with_source(self, vcf, bb_reader):
+ if process.returncode != 0:
+ logger.warning(f"Could not get variant count in bigBed - {process.stderr.decode()}")
+ return -1
+
+ bb_summaries = process.stdout.decode().split("\n")
+ for line in bb_summaries:
+ if "itemCount" in line:
+ return int(line.split(":")[1].replace(",", "").strip())
+
+ logger.warning(f"Could not get variant count in bigBed - failed parsing bigBedInfo output")
+ return -1
+
+ def test_compare_count_with_source(self, vcf, bigbed):
"""Compare approximate counts between VCF and bigBed-derived counts."""
variant_count_vcf = self.get_total_variant_count_from_vcf(vcf)
- variant_count_bb = self.get_total_variant_count_from_bb(bb_reader)
+ variant_count_bb = self.get_total_variant_count_from_bb(bigbed)
assert variant_count_bb > variant_count_vcf * 0.95
class TestSrcExistence:
- def test_variant_exist_from_source(self, bb_reader, vcf_reader):
+ def test_variant_exist_from_source(self, bb_reader, variant_list):
"""Sample variants from VCF and ensure corresponding bigBed entries exist and include IDs."""
- NO_VARIANTS = 100
- NO_ITER = 100000
-
- chrs = vcf_reader.seqnames
-
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- variants.append(variant)
- break
-
- iter += 1
-
- for variant in variants:
- chr = variant.CHROM
- start = int(variant.POS) - 1
+
+ for variant_id in variant_list:
+ chr = variant_list[variant_id]['chrom']
+ start = variant_list[variant_id]['pos'] - 1
end = start + 2 # for insertion
bb_entries = bb_reader.entries(chr, start, end)
- assert bb_entries is not None
- assert len(bb_entries) >= 1
+ if bb_entries is None or len(bb_entries) < 1:
+ raise AssertionError(f"bigBed entries does not exist or match with source - {chr}:{start}-{end}")
- id_in_vcf = variant.ID
ids_in_bb = []
for bb_entry in bb_entries:
id = bb_entry[2].split("\t")[0]
ids_in_bb.append(id)
- assert id_in_vcf in ids_in_bb
+ assert variant_id in ids_in_bb
\ No newline at end of file
diff --git a/datachecks/test_bigwig.py b/datachecks/tests/test_bigwig.py
similarity index 64%
rename from datachecks/test_bigwig.py
rename to datachecks/tests/test_bigwig.py
index e1b45fec..61ef1606 100755
--- a/datachecks/test_bigwig.py
+++ b/datachecks/tests/test_bigwig.py
@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import pytest
import os
import subprocess
-import random
+import numpy as np
+import logging
+from cyvcf2 import VCF
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
class TestFile:
def test_exist(self, bigwig):
@@ -29,32 +32,17 @@ def test_validity(self, bw_reader):
class TestSrcExistence:
- def test_variant_exist_from_source(self, bw_reader, vcf_reader):
+ def test_variant_exist_from_source(self, bw_reader, variant_list):
"""Sample variants from VCF and ensure BigWig has non-zero scores at those positions."""
- NO_VARIANTS = 100
- NO_ITER = 100000
- chrs = vcf_reader.seqnames
-
- variants = []
- iter = 0
- while len(variants) < NO_VARIANTS and iter <= NO_ITER:
- chr = random.choice(chrs)
- start = random.choice(range(10000, 1000000))
-
- for variant in vcf_reader(f"{chr}:{start}"):
- variants.append(variant)
- break
-
- iter += 1
-
- for variant in variants:
- chr = variant.CHROM
- start = int(variant.POS) - 1
+ for variant_id in variant_list:
+ chr = variant_list[variant_id]["chrom"]
+ start = variant_list[variant_id]["pos"] - 1
end = start + 2
bw_state = bw_reader.stats(chr, start, end)[0]
- assert bw_state > 0.0
+ if bw_state is None or not bw_state > 0.0:
+ raise AssertionError(f"bigWig value does exist or match with source - {chr}:{start}-{end}")
class TestSrcCount:
@@ -83,7 +71,7 @@ def get_total_variant_count_from_vcf(self, vcf: str) -> int:
return int(process.stdout.decode().strip())
- def get_total_variant_count_from_bw(self, bw_reader) -> int:
+ def get_total_variant_count_from_bw(self, bw_reader, chroms = None) -> int:
"""Count non-zero value positions across all chromosomes in a BigWig.
Args:
@@ -93,18 +81,31 @@ def get_total_variant_count_from_bw(self, bw_reader) -> int:
int: Count of positions with value > 0.0.
"""
variant_counts = 0
- for chr in bw_reader.chroms():
- non_zero_vals = [
- val
- for val in bw_reader.values(chr, 0, bw_reader.chroms(chr))
- if val > 0.0
- ]
- variant_counts += len(non_zero_vals)
+
+ if chroms is None:
+ chroms = bw_reader.chroms()
+
+ for chr in chroms:
+ end = bw_reader.chroms(chr)
+
+ window = 10000000
+ for s_i in range(0, end, window):
+ e_i = min(s_i+window, end)
+ values = np.array(bw_reader.values(chr, s_i, e_i))
+ variant_counts += np.count_nonzero(values)
return variant_counts
def test_compare_count_with_source(self, vcf, bw_reader):
"""Compare approximate variant counts between source VCF and BigWig-derived counts."""
variant_count_vcf = self.get_total_variant_count_from_vcf(vcf)
- variant_count_bw = self.get_total_variant_count_from_bw(bw_reader)
- assert variant_count_bw > variant_count_vcf * 0.95
+ variant_count_bw = 0
+ try:
+ vcf_reader = VCF(vcf)
+ chroms = vcf_reader.seqnames
+ vcf_reader.close()
+ variant_count_bw = self.get_total_variant_count_from_bw(bw_reader, chroms)
+ except:
+ variant_count_bw = self.get_total_variant_count_from_bw(bw_reader)
+
+ assert variant_count_bw > variant_count_vcf * 0.95
\ No newline at end of file
diff --git a/datachecks/tests/test_vcf.py b/datachecks/tests/test_vcf.py
new file mode 100755
index 00000000..26734b58
--- /dev/null
+++ b/datachecks/tests/test_vcf.py
@@ -0,0 +1,517 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import os
+from cyvcf2 import VCF
+from cyvcf2.cyvcf2 import Variant
+from typing import Callable
+import subprocess
+from math import isclose
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+CSQ_FIELDS = {
+ "Allele": {"empty_value": False, "field_existance": "all"},
+ "Consequence": {"empty_value": False, "field_existance": "all"},
+ "Feature": {"field_existance": "all"},
+ "VARIANT_CLASS": {"empty_value": False, "field_existance": "all"},
+ "SPDI": {"empty_value": False, "field_existance": "all"},
+ "PUBMED": {"field_existance": "all"},
+ "VAR_SYNONYMS": {"field_existance": "all"},
+ "PHENOTYPES": {},
+ "Conservation": {"field_existance": "homo_sapiens"},
+ "CADD_PHRED": {},
+ "AA": {},
+ "SIFT": {},
+ "PolyPhen": {},
+ "gnomAD_exomes_AF": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AF_sas": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AC_sas": {"field_existance": "homo_sapiens"},
+ "gnomAD_exomes_AN_sas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_afr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_amr": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_asj": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_eas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_fin": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_nfe": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_oth": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AF_sas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AC_sas": {"field_existance": "homo_sapiens"},
+ "gnomAD_genomes_AN_sas": {"field_existance": "homo_sapiens"},
+ "AF": {"field_existance": "homo_sapiens"},
+ "AFR_AF": {"field_existance": "homo_sapiens"},
+ "AMR_AF": {"field_existance": "homo_sapiens"},
+ "EAS_AF": {"field_existance": "homo_sapiens"},
+ "EUR_AF": {"field_existance": "homo_sapiens"},
+ "SAS_AF": {"field_existance": "homo_sapiens"},
+}
+
+class TestFile:
+ def test_exist(self, vcf):
+ """Assert that the provided VCF path exists on disk.
+
+ Args:
+ vcf (str): Path to the VCF file passed via pytest options.
+
+ Raises:
+ AssertionError: If the file does not exist.
+ """
+ assert os.path.isfile(vcf)
+ assert os.path.isfile(vcf + ".tbi") or os.path.isfile(vcf + ".csi")
+
+
+class TestHeader:
+ def test_file_format(self, vcf_reader):
+ """Assert VCF file defines a fileformat header entry."""
+ assert vcf_reader.get_header_type("fileformat")
+
+ def test_header_line(self, vcf_reader):
+ """Assert the VCF header contains the mandatory CHROM/POS/ID/REF/ALT columns."""
+ header_line = vcf_reader.raw_header.split("\n")[-2]
+ assert header_line.startswith("#CHROM\tPOS\tID\tREF\tALT")
+
+ def test_csq(self, vcf_reader):
+        """Verify that CSQ exists in the header.
+ """
+ assert vcf_reader.get_header_type("CSQ")
+
+ def test_source_info(self, vcf_reader):
+        """Verify that source info exists in the header.
+ """
+ assert vcf_reader.get_header_type("source")
+
+@pytest.mark.skip(reason="takes too much memory + not much relevance")
+class TestDuplicate:
+ def get_positioned_id(self, variant: Variant) -> str:
+ """Return a positioned identifier for a variant (chrom:pos:id).
+
+ Args:
+ variant (Variant): cyvcf2 Variant object.
+
+ Returns:
+ str: Formatted positioned identifier.
+ """
+ id = variant.ID or "unknown"
+ return variant.CHROM + ":" + str(variant.POS) + ":" + id
+
+ def get_id(self, variant: Variant) -> str:
+ """Return the raw variant ID.
+
+ Args:
+ variant (Variant): cyvcf2 Variant object.
+
+ Returns:
+ str|None: Variant ID as present in the VCF (or None).
+ """
+ return variant.ID
+
+ def no_duplicated_identifier(
+ self, vcf_reader: VCF, get_identifier: Callable
+ ) -> bool:
+ """Check that no duplicate identifiers are present in the VCF.
+
+ Iterates through the VCF and ensures each identifier produced by get_identifier
+ is unique.
+
+ Args:
+ vcf_reader (VCF): cyvcf2 VCF reader.
+ get_identifier (Callable): Function that, given a Variant, returns an identifier.
+
+ Returns:
+ bool: True if no duplicates were found, False otherwise.
+ """
+ removal_status = {}
+ for variant in vcf_reader:
+ variant_identifier = get_identifier(variant)
+ if variant_identifier in removal_status:
+ return False
+ removal_status[variant_identifier] = False
+
+ return True
+
+ def test_duplicate_positioned_id(self, vcf_reader):
+ """Assert there are no duplicate positioned identifiers in the VCF."""
+ assert self.no_duplicated_identifier(vcf_reader, self.get_positioned_id)
+
+ def test_duplicate_id(self, vcf_reader):
+ """Assert there are no duplicate variant IDs in the VCF."""
+ assert self.no_duplicated_identifier(vcf_reader, self.get_id)
+
+
+class TestSrcCount:
+ def get_total_variant_count_from_vcf(self, vcf: str) -> int:
+ """Return the total number of variants in a VCF.
+
+ Uses 'bcftools index --nrecords' and falls back to naive iteration if the command fails.
+
+ Args:
+ vcf (str): Path to VCF file.
+
+ Returns:
+ int: Number of records, or -1 on failure.
+ """
+ if vcf is None:
+ logger.warning(f"Could not get variant count - no file provided")
+ return -1
+
+ process = subprocess.run(
+ ["bcftools", "index", "--nrecords", vcf],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+        # if the bcftools command fails, try to naively iterate the file to get the count
+ if process.returncode != 0:
+ logger.warning(
+ f"Could not get variant count from {vcf} using bcftools\n Will retry in naive iteration method"
+ )
+ try:
+ local_vcf_reader = VCF(vcf)
+
+ count = 0
+ for _ in local_vcf_reader:
+ count += 1
+ local_vcf_reader.close()
+
+ return count
+ except:
+ return -1
+
+ return int(process.stdout.decode().strip())
+
+ def get_variant_count_from_vcf_by_chr(self, vcf: str) -> dict:
+ """Return per-chromosome variant counts for a VCF.
+
+ Attempts to use 'bcftools index --stats' and falls back to iterating by chromosome
+ if the command fails.
+
+ Args:
+ vcf (str): Path to VCF file.
+
+ Returns:
+ dict|int: Mapping {chrom: count} or -1 on failure.
+ """
+ if vcf is None:
+ logger.warning(f"Could not get variant count - no file provided")
+ return -1
+
+ process = subprocess.run(
+ ["bcftools", "index", "--stats", vcf],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+        # if the bcftools command fails, try to naively iterate the file to get the count
+ if process.returncode != 0:
+ logger.warning(
+ f"Could not get variant count from {vcf} using bcftools\n Will retry in naive iteration method"
+ )
+ try:
+ local_vcf_reader = VCF(vcf)
+ chrs = local_vcf_reader.seqnames
+
+ chrom_variant_counts = {}
+ for chrom in chrs:
+ count = 0
+                # to be safe, assume 100 billion bp as the maximum length of a chr
+ for _ in local_vcf_reader(f"{chrom}:1-100000000000"):
+ count += 1
+ chrom_variant_counts[chrom] = count
+ local_vcf_reader.close()
+
+ return chrom_variant_counts
+ except:
+ return -1
+
+ chrom_variant_counts = {}
+ for chrom_stat in process.stdout.decode().strip().split("\n"):
+ (chrom, contig, count) = chrom_stat.split("\t")
+ chrom_variant_counts[chrom] = int(count)
+
+ return chrom_variant_counts
+
+ def test_compare_count_with_source(self, vcf, source_vcf):
+ variant_count = self.get_total_variant_count_from_vcf(vcf)
+ source_variant_count = self.get_total_variant_count_from_vcf(source_vcf)
+
+ assert variant_count != -1
+ assert source_variant_count != -1
+ assert variant_count > source_variant_count * 0.90
+
+ def test_compare_count_with_source_by_chr(self, vcf_reader, vcf, source_vcf):
+ chrs = vcf_reader.seqnames
+ variant_counts = self.get_variant_count_from_vcf_by_chr(vcf)
+ source_variant_counts = self.get_variant_count_from_vcf_by_chr(source_vcf)
+
+ assert variant_counts != -1
+ assert source_variant_counts != -1
+
+ for chr in chrs:
+            # TBD: not all chrs will be present in the source VCF as vcf_prepper renames some of them
+            # TBD: not all chrs will be present in the vcf file as vcf_prepper removes some chr variants
+ if chr in variant_counts and chr in source_variant_counts:
+ assert variant_counts[chr] > source_variant_counts[chr] * 0.95
+
+class TestContent:
+ def get_csq_params():
+ params = []
+ for csq_field in CSQ_FIELDS:
+ canbe_empty = True
+ if "empty_value" in CSQ_FIELDS[csq_field]:
+ canbe_empty = CSQ_FIELDS[csq_field]["empty_value"]
+
+ if canbe_empty:
+ params.append(pytest.param(
+ csq_field, canbe_empty,
+ marks=[pytest.mark.xfail(reason=f"{csq_field} CSQ field can be empty")],
+ id=csq_field
+ ))
+ else:
+ params.append(pytest.param(
+ csq_field, canbe_empty,
+ id=csq_field
+ ))
+ return params
+
+ @pytest.mark.parametrize("csq_field, canbe_empty", get_csq_params())
+ def test_csq_content(self, variant_list, csq_field, canbe_empty, skip_xfail):
+ """Sample CSQ annotations from random variants and check field presence.
+
+ Selects a number of variants and ensures specified CSQ fields are present or
+ allowed to be empty according to CSQ_FIELDS configuration.
+ """
+
+ csq_field_cnt = 0
+ for variant_id in variant_list:
+ first_csq = variant_list[variant_id]['csqs'][0]
+ if first_csq.get(csq_field, "") != "":
+ csq_field_cnt += 1
+
+ max_variants = len(variant_list)
+ if not canbe_empty:
+ assert csq_field_cnt == max_variants
+ else:
+ if skip_xfail:
+ pytest.skip('Skipping xfail')
+ assert csq_field_cnt > 0
+
+
+class TestSummaryStatistics:
+ PER_ALLELE_FIELDS = {
+ "transcript_consequence": "NTCSQ",
+ "regulatory_consequence": "NRCSQ",
+ "gene": "NGENE",
+ "variant_phenotype": "NVPHN",
+ "gene_phenotype": "NGPHN",
+ }
+
+ SKIP_CONSEQUENCE = [
+ "downstream_gene_variant",
+ "upstream_gene_variant",
+ "intergenic_variant",
+ "TF_binding_site_variant",
+ "TFBS_ablation",
+ "TFBS_amplification",
+ ]
+
+ def test_summary_statistics_per_variant(self, variant_list):
+ """Validate per-variant summary fields such as NCITE against CSQ PUBMED entries."""
+
+ for variant_id in variant_list:
+ chrom = variant_list[variant_id]["chrom"]
+ pos = variant_list[variant_id]["pos"]
+ citation = set()
+
+ csqs = variant_list[variant_id]['csqs']
+ for csq in csqs:
+ if "PUBMED" in csq:
+ cites = csq.get("PUBMED", "")
+ for cite in cites.split("&"):
+ if cite != "":
+ citation.add(cite)
+
+ if len(citation) > 0:
+ actual = len(citation)
+ got = int(variant_list[variant_id]['NCITE'])
+ if not actual == got:
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id}] actual - {actual}; got - {got}")
+
+ def test_summary_statistics_per_allele(self, variant_list):
+ """Validate per-allele summary fields (NTCSQ, NRCSQ, NGENE, NGPHN, NVPHN) computed from CSQ."""
+
+ for variant_id in variant_list:
+ chrom = variant_list[variant_id]["chrom"]
+ pos = variant_list[variant_id]["pos"]
+ transcript_consequence = {}
+ regulatory_consequence = {}
+ gene = {}
+ gene_phenotype = {}
+ variant_phenotype = {}
+
+ csqs = variant_list[variant_id]['csqs']
+ for csq in csqs:
+ allele = csq["Allele"]
+ consequences = csq["Consequence"]
+ feature_stable_id = csq["Feature"]
+
+ for consequence in consequences.split("&"):
+ if consequence not in self.SKIP_CONSEQUENCE:
+ if consequence.startswith("regulatory"):
+ if allele not in regulatory_consequence:
+ regulatory_consequence[allele] = set()
+ regulatory_consequence[allele].add(
+ f"{feature_stable_id}:{consequences}"
+ )
+ else:
+ if allele not in transcript_consequence:
+ transcript_consequence[allele] = set()
+ transcript_consequence[allele].add(
+ f"{feature_stable_id}:{consequences}"
+ )
+ if allele not in gene:
+ gene[allele] = set()
+ gene[allele].add(csq["Gene"])
+
+ phenotypes = csq.get("PHENOTYPES", "")
+ for phenotype in phenotypes.split("&"):
+ pheno_per_allele_fields = phenotype.split("+")
+ if len(pheno_per_allele_fields) != 3:
+ continue
+
+ (name, source, feature) = pheno_per_allele_fields
+ if feature.startswith("ENS"):
+ if allele not in gene_phenotype:
+ gene_phenotype[allele] = set()
+ gene_phenotype[allele].add(f"{name}:{source}:{feature}")
+ else:
+ if allele not in variant_phenotype:
+ variant_phenotype[allele] = set()
+ variant_phenotype[allele].add(
+ f"{name}:{source}:{feature}"
+ )
+
+ for field in self.PER_ALLELE_FIELDS:
+ ss_info_field = self.PER_ALLELE_FIELDS[field]
+ field_object = locals()[field]
+
+ if len(field_object) > 1:
+ actual = sorted([len(val) for val in field_object.values()])
+ got = sorted(variant_list[variant_id][ss_info_field])
+ if not actual == got:
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id} - {ss_info_field}] actual - {actual}; got - {got}")
+ elif len(field_object) == 1:
+ actual = [len(val) for val in field_object.values()]
+ got = [variant_list[variant_id][ss_info_field]]
+ if not actual == got:
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id} - {ss_info_field}] actual - {actual}; got - {got}")
+ else:
+ got = variant_list[variant_id][ss_info_field]
+ if got is not None:
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id} - {ss_info_field}] actual - None; got - {got}")
+
+ def test_summary_statistics_frequency(self, variant_list, species):
+ """Validate representative allele frequency (RAF) matches frequencies from CSQ fields.
+
+ This test is only applicable to human species (GRCh38 and GRCh37).
+ """
+ if not species.startswith("homo_sapiens"):
+ pytest.skip("Unsupported species, skipping ...")
+
+ freq_csq_field = "gnomAD_genomes_AF"
+ for variant_id in variant_list:
+ chrom = variant_list[variant_id]["chrom"]
+ pos = variant_list[variant_id]["pos"]
+ frequency = {}
+
+ csqs = variant_list[variant_id]['csqs']
+ skip_variant = False
+ for csq in csqs:
+ allele = csq["Allele"]
+ freq = csq[freq_csq_field]
+
+ if freq != "":
+                    # in some cases we have multiple frequencies separated by & which we do not handle yet
+ if "&" in freq:
+ skip_variant = True
+ continue
+
+ frequency[allele] = float(freq)
+
+ if skip_variant:
+ continue
+
+ a_t = 1.0
+ r_t = 1e-5
+ if len(frequency) > 1:
+ actual = sorted(frequency.values())
+ got = sorted(
+ [val for val in variant_list[variant_id]["RAF"] if val is not None]
+ )
+ for idx, _ in enumerate(actual):
+ if not isclose(actual[idx], got[idx], rel_tol=r_t, abs_tol=a_t):
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id}] actual - {actual[idx]}; got - {got[idx]}")
+ elif len(frequency) == 1:
+ actual = frequency[list(frequency.keys())[0]]
+ if type(variant_list[variant_id]["RAF"]) is tuple:
+ got = [val for val in variant_list[variant_id]["RAF"] if val is not None]
+ if (not len(got) == 1) or (not isclose(actual, got[0], rel_tol=r_t, abs_tol=a_t)):
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id}] actual - {actual}; got - {got[0]}")
+ else:
+ if (variant_list[variant_id]["RAF"] is None) or (not isclose(actual, variant_list[variant_id]["RAF"], rel_tol=r_t, abs_tol=a_t)):
+ raise AssertionError(f"[{chrom}:{pos}:{variant_id}] actual - {actual}; got - {variant_list[variant_id]['RAF']}")
+ else:
+ assert variant_list[variant_id]["RAF"] is None
\ No newline at end of file
diff --git a/main.nf b/main.nf
new file mode 100644
index 00000000..e69de29b
diff --git a/nextflow/vcf_prepper/assets/population_data.json b/nextflow/vcf_prepper/assets/population_data.json
index 21ab3022..e0d578c1 100644
--- a/nextflow/vcf_prepper/assets/population_data.json
+++ b/nextflow/vcf_prepper/assets/population_data.json
@@ -395,7 +395,205 @@
]
}
],
- "homo_sapiens_gca\\d{9}v\\d{1}": [
+ "homo_sapiens_gca009914755v4": [
+ {
+ "name": "gnomAD_exomes",
+ "version": "4.1",
+ "files": [
+ {
+ "version": "4.1",
+ "short_name": "gnomAD_exomes",
+ "file_location": "/nfs/production/flicek/ensembl/production/ensemblftp/rapid-release/species/Homo_sapiens/##ASSEMBLY_ACC##/ensembl/variation/2022_10/vcf/2024_07/gnomad.exomes.v4.1.sites.##ASSEMBLY_ACC##.trimmed_liftover.vcf.gz",
+ "include_fields": [
+ {
+ "name": "gnomADe:ALL",
+ "fields": {
+ "af": "AF",
+ "ac": "AC",
+ "an": "AN"
+ }
+ },
+ {
+ "name": "gnomADe:afr",
+ "fields": {
+ "af": "AF_afr",
+ "ac": "AC_afr",
+ "an": "AN_afr"
+ }
+ },
+ {
+ "name": "gnomADe:amr",
+ "fields": {
+ "af": "AF_amr",
+ "ac": "AC_amr",
+ "an": "AN_amr"
+ }
+ },
+ {
+ "name": "gnomADe:asj",
+ "fields": {
+ "af": "AF_asj",
+ "ac": "AC_asj",
+ "an": "AN_asj"
+ }
+ },
+ {
+ "name": "gnomADe:eas",
+ "fields": {
+ "af": "AF_eas",
+ "ac": "AC_eas",
+ "an": "AN_eas"
+ }
+ },
+ {
+ "name": "gnomADe:fin",
+ "fields": {
+ "af": "AF_fin",
+ "ac": "AC_fin",
+ "an": "AN_fin"
+ }
+ },
+ {
+ "name": "gnomADe:mid",
+ "fields": {
+ "af": "AF_mid",
+ "ac": "AC_mid",
+ "an": "AN_mid"
+ }
+ },
+ {
+ "name": "gnomADe:nfe",
+ "fields": {
+ "af": "AF_nfe",
+ "ac": "AC_nfe",
+ "an": "AN_nfe"
+ }
+ },
+ {
+ "name": "gnomADe:remaining",
+ "fields": {
+ "af": "AF_remaining",
+ "ac": "AC_remaining",
+ "an": "AN_remaining"
+ }
+ },
+ {
+ "name": "gnomADe:sas",
+ "fields": {
+ "af": "AF_sas",
+ "ac": "AC_sas",
+ "an": "AN_sas"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "gnomAD_genomes",
+ "version": "4.1",
+ "representative": true,
+ "files": [
+ {
+ "version": "4.1",
+ "short_name": "gnomAD_genomes",
+ "representative_af_field": "AF",
+ "file_location": "/nfs/production/flicek/ensembl/production/ensemblftp/rapid-release/species/Homo_sapiens/##ASSEMBLY_ACC##/ensembl/variation/2022_10/vcf/2024_07/gnomad.genomes.v4.1.sites.##ASSEMBLY_ACC##.trimmed_liftover.vcf.gz",
+ "include_fields": [
+ {
+ "name": "gnomADg:ALL",
+ "fields": {
+ "af": "AF",
+ "ac": "AC",
+ "an": "AN"
+ }
+ },
+ {
+ "name": "gnomADg:afr",
+ "fields": {
+ "af": "AF_afr",
+ "ac": "AC_afr",
+ "an": "AN_afr"
+ }
+ },
+ {
+ "name": "gnomADg:ami",
+ "fields": {
+ "af": "AF_ami",
+ "ac": "AC_ami",
+ "an": "AN_ami"
+ }
+ },
+ {
+ "name": "gnomADg:amr",
+ "fields": {
+ "af": "AF_amr",
+ "ac": "AC_amr",
+ "an": "AN_amr"
+ }
+ },
+ {
+ "name": "gnomADg:asj",
+ "fields": {
+ "af": "AF_asj",
+ "ac": "AC_asj",
+ "an": "AN_asj"
+ }
+ },
+ {
+ "name": "gnomADg:eas",
+ "fields": {
+ "af": "AF_eas",
+ "ac": "AC_eas",
+ "an": "AN_eas"
+ }
+ },
+ {
+ "name": "gnomADg:fin",
+ "fields": {
+ "af": "AF_fin",
+ "ac": "AC_fin",
+ "an": "AN_fin"
+ }
+ },
+ {
+ "name": "gnomADg:mid",
+ "fields": {
+ "af": "AF_mid",
+ "ac": "AC_mid",
+ "an": "AN_mid"
+ }
+ },
+ {
+ "name": "gnomADg:nfe",
+ "fields": {
+ "af": "AF_nfe",
+ "ac": "AC_nfe",
+ "an": "AN_nfe"
+ }
+ },
+ {
+ "name": "gnomADg:remaining",
+ "fields": {
+ "af": "AF_remaining",
+ "ac": "AC_remaining",
+ "an": "AN_remaining"
+ }
+ },
+ {
+ "name": "gnomADg:sas",
+ "fields": {
+ "af": "AF_sas",
+ "ac": "AC_sas",
+ "an": "AN_sas"
+ }
+ }
+ ]
+ }
+ ]
+ }
+ ],
+ "homo_sapiens_gca018[458][05-7][0-79]\\d{2}5v1": [
{
"name": "gnomAD_exomes",
"version": "4.1",
@@ -1071,6 +1269,25 @@
}
],
"triticum_aestivum": [
+ {
+ "name": "MAGIC-16-Diversity",
+ "files": [
+ {
+ "file_location": "/nfs/production/flicek/ensembl/variation/new_website/vep/custom_data/frequency_projects/triticum_aestivum/IWGSC/MAGIC.vcf.gz",
+ "short_name": "MAGIC-16-Diversity",
+ "include_fields": [
+ {
+ "name": "MAGIC-16-Diversity",
+ "fields": {
+ "af": "AF",
+ "ac": "AC",
+ "an": "AN"
+ }
+ }
+ ]
+ }
+ ]
+ },
{
"name": "Axiom",
"files": [
diff --git a/nextflow/vcf_prepper/assets/short_variant_bedfields.json b/nextflow/vcf_prepper/assets/short_variant_bedfields.json
new file mode 100644
index 00000000..70757046
--- /dev/null
+++ b/nextflow/vcf_prepper/assets/short_variant_bedfields.json
@@ -0,0 +1,42 @@
+{
+ "name": "short_variant",
+ "description": "BED fields for short variant track in genome browser",
+ "fields": {
+ "chromosome": {
+ "type": "string",
+ "pos": 1
+ },
+ "start": {
+ "type": "uint",
+ "pos": 2
+ },
+ "end": {
+ "type": "uint",
+ "pos": 3
+ },
+ "id": {
+ "type": "string",
+ "pos": 4
+ },
+ "variety": {
+ "type": "lstring",
+ "pos": 5
+ },
+ "reference": {
+ "type": "lstring",
+ "pos": 6
+ },
+ "alts": {
+ "type": "lstring",
+ "pos": 7
+ },
+ "group": {
+ "type": "ubyte",
+ "pos": 8
+ },
+ "severity": {
+ "type": "lstring",
+ "pos": 9
+ }
+ }
+}
diff --git a/nextflow/vcf_prepper/assets/sources_meta.json b/nextflow/vcf_prepper/assets/sources_meta.json
index 1c173ffe..fcf22386 100755
--- a/nextflow/vcf_prepper/assets/sources_meta.json
+++ b/nextflow/vcf_prepper/assets/sources_meta.json
@@ -1,149 +1,160 @@
[
- {
- "name": "Ensembl",
- "version": "110",
- "description": "Ensembl",
- "url": "https://beta.ensembl.org",
- "accession_url": "https://beta.ensembl.org/"
- },
- {
- "name": "dbSNP",
- "version": "157",
- "description": "NCBI db of human variants",
- "url": "https://www.ncbi.nlm.nih.gov/snp/",
- "accession_url": "https://www.ncbi.nlm.nih.gov/snp/"
- },
- {
- "name": "EVA",
- "version": "7",
- "description": "European Variation Archive",
- "url": "https://www.ebi.ac.uk/eva",
- "accession_url": "https://www.ebi.ac.uk/eva/?variant&accessionID="
- },
- {
- "name": "The_1001_Genomes_Project",
- "version": "2016",
- "description": "The 1001 Genomes Project provides a catalog of genetic variation in the reference plant Arabidopsis thaliana. Launched at the beginning of 2008 to discover sequence variation in at least 1001 strains (accessions). Since 2010, several sets of genomes have been released. The first phase of the project was completed in 2016 with publication of a detailed analysis of 1,135 genomes.",
- "url": "http://1001genomes.org/"
- },
- {
- "name": "Nordborg",
- "description": "Variation annotations from Nordborg"
- },
- {
- "name": "IBSC",
- "version": "0",
- "description": "International Barley Sequencing Consortium, remapped to MorexV3_pseudomolecules_assembly",
- "url": "http://europepmc.org/abstract/MED/23075845"
- },
- {
- "name": "Ensembl_Plants",
- "version": "0",
- "description": "Previous release of Ensembl Plants"
- },
- {
- "name": "iSelect",
- "version": "0",
- "description": "Illumina iSelect SNP chip IDs",
- "url": "http://bioinf.hutton.ac.uk/iselect/app/"
- },
- {
- "name": "JHI",
- "version": "0",
- "description": "James Hutton Institute, Dundee, United Kingdom",
- "url": "https://ics.hutton.ac.uk/50k/"
- },
- {
- "name": "Gramene_QTLdb",
- "version": "1",
- "description": "QTLs from the legacy Gramene QTL Database",
- "url": "http://archive.gramene.org/db/qtl/"
- },
- {
- "name": "gramene-marker",
- "version": "1",
- "description": "gramene markers",
- "url": "http://archive.gramene.org/markers"
- },
- {
- "name": "Qtaro_QTLdb",
- "version": "1",
- "description": "QTLs from the Qtaro Database and remapped to IRGSPv1 assembly by keygen",
- "url": "http://qtaro.abr.affrc.go.jp/qtab/table"
- },
- {
- "name": "The_150_TGRSP",
- "version": "1",
- "description": "The 150 Tomato Genome ReSequencing Project (150 TGRSP) is revealing the genetic variation available in tomato, one of the most important vegetable crops, for breeding in the face of climate change, sustainability, and demand for more and better food",
- "url": "http://www.tomatogenome.net/"
- },
- {
- "name": "Addo-Quayes_study",
- "description": "Whole-Genome Sequence Accuracy Is Improved by Replication in a Population of EMS Mutagenized Sorghum"
- },
- {
- "name": "Yinping-Jiao-2016-Study",
- "description": "(Recalls projected against sorghum bicolor v3.1) EMS induced mutations in Sorghum as an Efficient Platform for Gene Discovery in Grasses"
- },
- {
- "name": "Lozano_study",
- "description": "Whole genome resequencing of 499 sorghum lines from three sources - Mace (42), TERRA-MEPP (254), and TERRA-REF (203) to study deleterious mutations in sorghum."
- },
- {
- "name": "Sorghum_QTL_Atlas",
- "version": "1",
- "description": "QTLs from the the Sorghum QTL Atlas Database developed by Emma Mace group",
- "url": "https://aussorgm.org.au/sorghum-qtl-atlas/"
- },
- {
- "name": "Lozano_study",
- "version": "1",
- "description": "QTLs from the Qtaro Database and remapped to IRGSPv1 assembly by keygen",
- "url": "http://qtaro.abr.affrc.go.jp/qtab/table"
- },
- {
- "name": "CerealsDB",
- "description": "Markers from Axiom 820K and 35K SNP Array provided by CerealsDBn"
- },
- {
- "name": "EMS-induced mutation",
- "description": "EMS-induced mutations from sequenced wheat TILLING populations. Seeds can be ordered from UK SeedStor or US Dubcovsky lab. Line identifier is variant name up to dot (e.g Kronos3128)."
- },
- {
- "name": "Inter-homoeologous",
- "description": "Inter-Homoeologous Variants (IHVs) called by alignments of the A,B and D component genomes"
- },
- {
- "name": "Nottingham_WRC",
- "description": "Markers from Axiom Wheat Relatives Array provided by CerealsDB and from Grewal et al., 2020"
- },
- {
- "name": "Watkins-exome-capture",
- "description": "103 accessions of the Watkins landrace collection"
- },
- {
- "name": "Exome_Capture_Diversity",
- "description": "890 diverse wheat landraces and cultivars from He et al"
- },
- {
- "name": "Watkins_collection",
- "description": "Variants from the the Watkins Collection which is a diverse assortment of heritage wheat varieties from the 1930s, preserved for its unique genetic potential in modern breeding"
- },
- {
- "name": "CNR-ITB",
- "description": "Markers from Axiom 820K, 35K and iSelect 90K SNP Infinium are provided by CerealsDB. Markers from TaBW280K Affymetrix arrary refer to Rimbert, H. et al."
- },
- {
- "name": "CSHL/Cornell",
- "version": "1",
- "description": "Cold Spring Harbor Laboratory / Cornell University [remapped to PN40024.v4]"
- },
- {
- "name": "HapMap2",
- "description": "Remapped to Zm-B73-REFERENCE-NAM-5.0"
- },
- {
- "name": "Panzea_2.7GBS",
- "description": "Remapped to Zm-B73-REFERENCE-NAM-5.0"
- }
-]
\ No newline at end of file
+ {
+ "name": "Ensembl",
+ "version": "110",
+ "description": "Ensembl",
+ "url": "https://beta.ensembl.org",
+ "accession_url": "https://beta.ensembl.org/"
+ },
+ {
+ "name": "dbSNP",
+ "version": "157",
+ "description": "NCBI db of human variants",
+ "url": "https://www.ncbi.nlm.nih.gov/snp/",
+ "accession_url": "https://www.ncbi.nlm.nih.gov/snp/"
+ },
+ {
+ "name": "EVA",
+ "version": "7",
+ "description": "European Variation Archive",
+ "url": "https://www.ebi.ac.uk/eva",
+ "accession_url": "https://www.ebi.ac.uk/eva/?variant&accessionID="
+ },
+ {
+ "name": "The_1001_Genomes_Project",
+ "version": "2016",
+ "description": "The 1001 Genomes Project provides a catalog of genetic variation in the reference plant Arabidopsis thaliana. Launched at the beginning of 2008 to discover sequence variation in at least 1001 strains (accessions). Since 2010, several sets of genomes have been released. The first phase of the project was completed in 2016 with publication of a detailed analysis of 1,135 genomes.",
+ "url": "http://1001genomes.org/"
+ },
+ {
+ "name": "Nordborg",
+ "description": "Variation annotations from Nordborg"
+ },
+ {
+ "name": "IBSC",
+ "version": "0",
+ "description": "International Barley Sequencing Consortium, remapped to MorexV3_pseudomolecules_assembly",
+ "url": "http://europepmc.org/abstract/MED/23075845"
+ },
+ {
+ "name": "Ensembl_Plants",
+ "version": "0",
+ "description": "Previous release of Ensembl Plants"
+ },
+ {
+ "name": "iSelect",
+ "version": "0",
+ "description": "Illumina iSelect SNP chip IDs",
+ "url": "http://bioinf.hutton.ac.uk/iselect/app/"
+ },
+ {
+ "name": "JHI",
+ "version": "0",
+ "description": "James Hutton Institute, Dundee, United Kingdom",
+ "url": "https://ics.hutton.ac.uk/50k/"
+ },
+ {
+ "name": "Gramene_QTLdb",
+ "version": "1",
+ "description": "QTLs from the legacy Gramene QTL Database",
+ "url": "http://archive.gramene.org/db/qtl/"
+ },
+ {
+ "name": "gramene-marker",
+ "version": "1",
+ "description": "gramene markers",
+ "url": "http://archive.gramene.org/markers"
+ },
+ {
+ "name": "Qtaro_QTLdb",
+ "version": "1",
+ "description": "QTLs from the Qtaro Database and remapped to IRGSPv1 assembly by keygen",
+ "url": "http://qtaro.abr.affrc.go.jp/qtab/table"
+ },
+ {
+ "name": "The_150_TGRSP",
+ "version": "1",
+ "description": "The 150 Tomato Genome ReSequencing Project (150 TGRSP) is revealing the genetic variation available in tomato, one of the most important vegetable crops, for breeding in the face of climate change, sustainability, and demand for more and better food",
+ "url": "http://www.tomatogenome.net/"
+ },
+ {
+ "name": "Addo-Quayes_study",
+ "description": "Whole-Genome Sequence Accuracy Is Improved by Replication in a Population of EMS Mutagenized Sorghum"
+ },
+ {
+ "name": "Yinping-Jiao-2016-Study",
+ "description": "(Recalls projected against sorghum bicolor v3.1) EMS induced mutations in Sorghum as an Efficient Platform for Gene Discovery in Grasses"
+ },
+ {
+ "name": "Lozano_study",
+ "description": "Whole genome resequencing of 499 sorghum lines from three sources - Mace (42), TERRA-MEPP (254), and TERRA-REF (203) to study deleterious mutations in sorghum."
+ },
+ {
+ "name": "Sorghum_QTL_Atlas",
+ "version": "1",
+ "description" : "QTLs from the Sorghum QTL Atlas Database developed by Emma Mace group",
+ "url": "https://aussorgm.org.au/sorghum-qtl-atlas/"
+ },
+ {
+ "name": "Lozano_study",
+ "version": "1",
+ "description": "QTLs from the Qtaro Database and remapped to IRGSPv1 assembly by keygen",
+ "url": "http://qtaro.abr.affrc.go.jp/qtab/table"
+ },
+ {
+ "name": "CerealsDB",
+ "description": "Markers from Axiom 820K and 35K SNP Array provided by CerealsDB"
+ },
+ {
+ "name": "EMS-induced mutation",
+ "description": "EMS-induced mutations from sequenced wheat TILLING populations. Seeds can be ordered from UK SeedStor or US Dubcovsky lab. Line identifier is variant name up to dot (e.g. Kronos3128)."
+ },
+ {
+ "name": "Inter-homoeologous",
+ "description": "Inter-Homoeologous Variants (IHVs) called by alignments of the A,B and D component genomes"
+ },
+ {
+ "name": "Nottingham_WRC",
+ "description": "Markers from Axiom Wheat Relatives Array provided by CerealsDB and from Grewal et al., 2020"
+ },
+ {
+ "name": "Watkins-exome-capture",
+ "description": "103 accessions of the Watkins landrace collection"
+ },
+ {
+ "name": "Exome_Capture_Diversity",
+ "description": "890 diverse wheat landraces and cultivars from He et al"
+ },
+ {
+ "name": "Watkins_collection",
+ "description": "Variants from the Watkins Collection which is a diverse assortment of heritage wheat varieties from the 1930s, preserved for its unique genetic potential in modern breeding"
+ },
+ {
+ "name": "ENA",
+ "description": "Variants from the European Nucleotide Archive (ENA)"
+ },
+ {
+ "name": "CNR-ITB",
+ "description": "Markers from Axiom 820K, 35K and iSelect 90K SNP Infinium are provided by CerealsDB. Markers from TaBW280K Affymetrix array refer to Rimbert, H. et al."
+ },
+ {
+ "name": "CSHL/Cornell",
+ "version": "1",
+ "description": "Cold Spring Harbor Laboratory / Cornell University [remapped to PN40024.v4]"
+ },
+ {
+ "name": "HapMap2",
+ "description": "Remapped to Zm-B73-REFERENCE-NAM-5.0"
+ },
+ {
+ "name": "Panzea_2.7GBS",
+ "description": "Remapped to Zm-B73-REFERENCE-NAM-5.0"
+ },
+ {
+ "name": "ClinVar",
+ "version": "2025-11-09",
+ "description": "ClinVar is a free public archive of reports of the relationships between human variations and phenotypes.",
+ "url": "https://www.ncbi.nlm.nih.gov/clinvar/",
+ "accession_url": "https://www.ncbi.nlm.nih.gov/clinvar/variation/"
+ }
+]
diff --git a/nextflow/vcf_prepper/assets/structural_variant_bedfields.json b/nextflow/vcf_prepper/assets/structural_variant_bedfields.json
new file mode 100644
index 00000000..f6521b13
--- /dev/null
+++ b/nextflow/vcf_prepper/assets/structural_variant_bedfields.json
@@ -0,0 +1,46 @@
+{
+ "name": "structural_variant",
+ "description": "BED fields for structural variant track in genome browser",
+ "fields": {
+ "chromosome": {
+ "type": "string",
+ "pos": 1
+ },
+ "start": {
+ "type": "uint",
+ "pos": 2
+ },
+ "end": {
+ "type": "uint",
+ "pos": 3
+ },
+ "id": {
+ "type": "string",
+ "pos": 4
+ },
+ "variety": {
+ "type": "lstring",
+ "pos": 5
+ },
+ "reference": {
+ "type": "lstring",
+ "pos": 6
+ },
+ "alts": {
+ "type": "lstring",
+ "pos": 7
+ },
+ "group": {
+ "type": "ubyte",
+ "pos": 8
+ },
+ "severity": {
+ "type": "lstring",
+ "pos": 9
+ },
+ "extent": {
+ "type": "int",
+ "pos": 10
+ }
+ }
+}
diff --git a/nextflow/vcf_prepper/assets/variation_consequnce_rank.json b/nextflow/vcf_prepper/assets/variation_consequnce_rank.json
index dc166f49..6c96dd26 100644
--- a/nextflow/vcf_prepper/assets/variation_consequnce_rank.json
+++ b/nextflow/vcf_prepper/assets/variation_consequnce_rank.json
@@ -1,43 +1,43 @@
{
- "feature_truncation" : "10",
- "incomplete_terminal_codon_variant" : "19",
- "inframe_insertion" : "11",
- "regulatory_region_variant" : "39",
- "3_prime_UTR_variant" : "26",
- "stop_gained" : "4",
"stop_retained_variant" : "21",
+ "frameshift_variant" : "5",
+ "intergenic_variant" : "40",
+ "5_prime_UTR_variant" : "25",
"regulatory_region_amplification" : "38",
- "transcript_amplification" : "8",
- "protein_altering_variant" : "14",
- "transcript_ablation" : "1",
- "splice_acceptor_variant" : "2",
- "missense_variant" : "13",
- "coding_sequence_variant" : "23",
- "inframe_deletion" : "12",
+ "TFBS_ablation" : "34",
+ "synonymous_variant" : "22",
+ "splice_donor_region_variant" : "17",
"feature_elongation" : "9",
- "TF_binding_site_variant" : "36",
- "5_prime_UTR_variant" : "25",
+ "coding_transcript_variant" : "31",
"sequence_variant" : "41",
- "mature_miRNA_variant" : "24",
- "NMD_transcript_variant" : "29",
- "synonymous_variant" : "22",
- "regulatory_region_ablation" : "37",
+ "non_coding_transcript_exon_variant" : "27",
"splice_donor_5th_base_variant" : "15",
+ "stop_gained" : "4",
+ "incomplete_terminal_codon_variant" : "19",
+ "TF_binding_site_variant" : "36",
+ "protein_altering_variant" : "14",
+ "NMD_transcript_variant" : "29",
+ "feature_truncation" : "10",
+ "splice_donor_variant" : "3",
+ "inframe_insertion" : "11",
+ "transcript_ablation" : "1",
+ "missense_variant" : "13",
"splice_region_variant" : "16",
- "non_coding_transcript_variant" : "30",
- "splice_donor_region_variant" : "17",
- "intergenic_variant" : "40",
- "TFBS_ablation" : "34",
+ "TFBS_amplification" : "35",
+ "coding_sequence_variant" : "23",
+ "splice_polypyrimidine_tract_variant" : "18",
"start_lost" : "7",
+ "3_prime_UTR_variant" : "26",
"stop_lost" : "6",
- "splice_donor_variant" : "3",
- "TFBS_amplification" : "35",
- "downstream_gene_variant" : "33",
- "non_coding_transcript_exon_variant" : "27",
- "upstream_gene_variant" : "32",
- "frameshift_variant" : "5",
"start_retained_variant" : "20",
- "coding_transcript_variant" : "31",
+ "transcript_amplification" : "8",
+ "mature_miRNA_variant" : "24",
+ "inframe_deletion" : "12",
+ "splice_acceptor_variant" : "2",
+ "regulatory_region_variant" : "39",
+ "non_coding_transcript_variant" : "30",
+ "upstream_gene_variant" : "32",
+ "downstream_gene_variant" : "33",
"intron_variant" : "28",
- "splice_polypyrimidine_tract_variant" : "18"
+ "regulatory_region_ablation" : "37"
}
diff --git a/nextflow/vcf_prepper/bin/generate_vep_config.py b/nextflow/vcf_prepper/bin/generate_vep_config.py
index b3f35a72..b1e9e737 100755
--- a/nextflow/vcf_prepper/bin/generate_vep_config.py
+++ b/nextflow/vcf_prepper/bin/generate_vep_config.py
@@ -554,6 +554,7 @@ def main(args=None):
"[ERROR] Cannot use both cache and gff at the same time, but both given."
)
+ cache_version = version
if args.cache_dir:
cache_dir = args.cache_dir
cache_version = get_relative_version(version, division)
@@ -635,10 +636,13 @@ def main(args=None):
file.write("variant_class 1\n")
file.write("protein 1\n")
file.write("transcript_version 1\n")
- file.write("gencode_primary 1\n")
+ file.write("allele_number 1\n")
+
+ if species == "homo_sapiens" and assembly == "GRCh38":
+ file.write("gencode_primary 1\n")
if args.cache_dir:
- file.write(f"cache_version {version}\n")
+ file.write(f"cache_version {cache_version}\n")
file.write(f"cache {args.cache_dir}\n")
file.write("offline 1\n")
elif args.gff_dir:
diff --git a/nextflow/vcf_prepper/bin/summary_stats.py b/nextflow/vcf_prepper/bin/summary_stats.py
index 764f3edd..2d45c60e 100755
--- a/nextflow/vcf_prepper/bin/summary_stats.py
+++ b/nextflow/vcf_prepper/bin/summary_stats.py
@@ -140,36 +140,6 @@ def header_match(want_header: dict, got_header: dict) -> bool:
)
-def minimise_allele(ref: str, alts: list) -> str:
- """Minimise allele representation by trimming a common leading base.
-
- If all alleles share the same first base, remove it from REF and ALT alleles; an empty
- allele becomes '-' to preserve representation.
-
- Args:
- ref (str): Reference allele.
- alts (list): List of alternate alleles.
-
- Returns:
- tuple: (minimised_ref, minimised_alts) where minimised_alts is a list of strings.
- """
- alleles = [ref] + alts
- first_bases = {allele[0] for allele in alleles}
-
- if len(first_bases) == 1:
- ref = ref[1:] or "-"
-
- temp_alts = []
- for alt in alts:
- if "*" in alt:
- temp_alts.append(alt)
- else:
- temp_alts.append(alt[1:] or "-")
- alts = temp_alts
-
- return (ref, alts)
-
-
def main(args=None):
"""Compute and add summary INFO fields to VCF records.
@@ -292,7 +262,11 @@ def main(args=None):
# iterate through the file
for variant in input_vcf:
# create minimalized allele order
- (ref, allele_order) = minimise_allele(variant.REF, variant.ALT)
+ num_of_alleles = len(variant.ALT)
+
+ if "ALLELE_NUM" not in csq_header_idx and num_of_alleles > 1:
+ print("[ERROR] INFO/CSQ must contain ALLELE_NUM for input with multi-allelic variants")
+ exit(1)
items_per_variant = {item: set() for item in PER_VARIANT_FIELDS}
items_per_allele = {}
@@ -302,9 +276,9 @@ def main(args=None):
for csq in csqs.split(","):
csq_values = csq.split("|")
- allele = csq_values[csq_header_idx["Allele"]]
- if allele not in items_per_allele:
- items_per_allele[allele] = {item: set() for item in PER_ALLELE_FIELDS}
+ allele_num = csq_values[csq_header_idx.get("ALLELE_NUM", 1)]
+ if allele_num not in items_per_allele:
+ items_per_allele[allele_num] = {item: set() for item in PER_ALLELE_FIELDS}
consequences = csq_values[csq_header_idx["Consequence"]]
feature_stable_id = csq_values[csq_header_idx["Feature"]]
@@ -322,16 +296,16 @@ def main(args=None):
if add_transcript_feature:
# genes
gene = csq_values[csq_header_idx["Gene"]]
- items_per_allele[allele]["gene"].add(gene)
+ items_per_allele[allele_num]["gene"].add(gene)
# transcipt consequences
- items_per_allele[allele]["transcipt_consequence"].add(
+ items_per_allele[allele_num]["transcipt_consequence"].add(
f"{feature_stable_id}:{consequences}"
)
# regualtory consequences
if add_regulatory_feature:
- items_per_allele[allele]["regulatory_consequence"].add(
+ items_per_allele[allele_num]["regulatory_consequence"].add(
f"{feature_stable_id}:{consequences}"
)
@@ -346,11 +320,11 @@ def main(args=None):
(name, source, feature) = pheno_per_allele_fields
if feature.startswith("ENS"):
- items_per_allele[allele]["gene_phenotype"].add(
+ items_per_allele[allele_num]["gene_phenotype"].add(
f"{name}:{source}:{feature}"
)
else:
- items_per_allele[allele]["variant_phenotype"].add(
+ items_per_allele[allele_num]["variant_phenotype"].add(
f"{name}:{source}:{feature}"
)
@@ -422,14 +396,16 @@ def main(args=None):
exit(1)
if len(frequencies) == 1:
- items_per_allele[allele]["frequency"] = frequencies[0]
+ items_per_allele[allele_num]["frequency"] = frequencies[0]
# create summary info for per allele fields
for field in PER_ALLELE_FIELDS:
field_nums = []
- for allele in allele_order:
- if allele in items_per_allele and field in items_per_allele[allele]:
- field_len = len(items_per_allele[allele][field])
+ for allele_num in range(1, num_of_alleles + 1):
+ allele_num = str(allele_num)
+
+ if allele_num in items_per_allele and field in items_per_allele[allele_num]:
+ field_len = len(items_per_allele[allele_num][field])
if field_len > 0:
field_nums.append(str(field_len))
@@ -438,9 +414,11 @@ def main(args=None):
# create summary info for frequency
field_vals = []
- for allele in allele_order:
- if allele in items_per_allele and "frequency" in items_per_allele[allele]:
- field_vals.append(items_per_allele[allele]["frequency"])
+ for allele_num in range(1, num_of_alleles + 1):
+ allele_num = str(allele_num)
+
+ if allele_num in items_per_allele and "frequency" in items_per_allele[allele_num]:
+ field_vals.append(items_per_allele[allele_num]["frequency"])
else:
field_vals.append(".")
diff --git a/nextflow/vcf_prepper/bin/update_fields.py b/nextflow/vcf_prepper/bin/update_fields.py
index ce7119ee..3146ef52 100755
--- a/nextflow/vcf_prepper/bin/update_fields.py
+++ b/nextflow/vcf_prepper/bin/update_fields.py
@@ -26,6 +26,7 @@
##INFO=
##INFO=
##INFO=
+##INFO=
"""
HEADER = """#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
"""
@@ -234,11 +235,10 @@ def main(args=None):
break
info_fields = f"SOURCE={variant_source}"
- if variant.INFO.get("SVLEN"):
- info_fields += f";SVLEN={variant.INFO['SVLEN']}"
-
- if variant.INFO.get("END"):
- info_fields += f";END={variant.INFO['END']}"
+ retainable_info = ["SVLEN", "END", "NODEID"]
+ for info_field in retainable_info:
+ if variant.INFO.get(info_field):
+ info_fields += f";{info_field}={variant.INFO[info_field]}"
o_file.write(
"\t".join(
diff --git a/nextflow/vcf_prepper/bin/vcf_to_bed b/nextflow/vcf_prepper/bin/vcf_to_bed
index a8c62063..976dd97c 100755
Binary files a/nextflow/vcf_prepper/bin/vcf_to_bed and b/nextflow/vcf_prepper/bin/vcf_to_bed differ
diff --git a/nextflow/vcf_prepper/modules/local/bed_to_bigbed.nf b/nextflow/vcf_prepper/modules/local/bed_to_bigbed.nf
index 0d310ba5..548edb3d 100755
--- a/nextflow/vcf_prepper/modules/local/bed_to_bigbed.nf
+++ b/nextflow/vcf_prepper/modules/local/bed_to_bigbed.nf
@@ -18,7 +18,6 @@
process BED_TO_BIGBED {
- label 'process_high'
input:
tuple val(meta), path(bed)
@@ -30,15 +29,23 @@ process BED_TO_BIGBED {
source = meta.source.toLowerCase()
output_bb = "${meta.genome_tracks_outdir}/variant-${source}-details.bb"
chrom_sizes = meta.chrom_sizes
- // structural variant has extent as 10th column
- type = params.structural_variant ? "-type=bed3+7" : "-type=bed3+6"
+ bed_fields = params.bed_fields
'''
- bedToBigBed !{type} !{bed} !{chrom_sizes} !{output_bb}
+ total_fields=$(cat !{bed_fields} | jq .fields | jq length)
+ extra_fields=$((total_fields-3))
+ if [ "$extra_fields" -lt "0" ]
+ then
+ echo "Extra fields cannot be less than 0"
+ exit 1
+ fi
+
+ type="-type=bed3+${extra_fields}"
+ bedToBigBed ${type} !{bed} !{chrom_sizes} !{output_bb}
ln -sf !{output_bb} "variant-!{source}-details.bb"
# temp: for one source we create symlink for focus if only one source present
- if [[ ! !{meta.multiple_source} ]]
+ if [[ "!{meta.multiple_source}" == "false" ]]
then
cd !{meta.genome_tracks_outdir}
ln -sf variant-!{source}-details.bb variant-details.bb
diff --git a/nextflow/vcf_prepper/modules/local/index_vcf.nf b/nextflow/vcf_prepper/modules/local/index_vcf.nf
index dfb1a891..e514aa8a 100755
--- a/nextflow/vcf_prepper/modules/local/index_vcf.nf
+++ b/nextflow/vcf_prepper/modules/local/index_vcf.nf
@@ -33,6 +33,6 @@ process INDEX_VCF {
'''
ln -sf !{vcf} !{new_vcf}
- bcftools index !{flag_index} !{new_vcf}
+ bcftools index -f !{flag_index} !{new_vcf}
'''
}
diff --git a/nextflow/vcf_prepper/modules/local/process_gff.nf b/nextflow/vcf_prepper/modules/local/process_gff.nf
index fdc607bd..334d72de 100755
--- a/nextflow/vcf_prepper/modules/local/process_gff.nf
+++ b/nextflow/vcf_prepper/modules/local/process_gff.nf
@@ -29,7 +29,7 @@ process PROCESS_GFF {
shell:
genome = meta.genome
genome_uuid = meta.genome_uuid
- release_id = meta.release_id
+ release_id = params.release_id
version = params.version
out_dir = meta.genome_temp_dir
ini_file = params.ini_file
diff --git a/nextflow/vcf_prepper/modules/local/vcf_to_bed.nf b/nextflow/vcf_prepper/modules/local/vcf_to_bed.nf
index fa0aa4bd..efa739df 100755
--- a/nextflow/vcf_prepper/modules/local/vcf_to_bed.nf
+++ b/nextflow/vcf_prepper/modules/local/vcf_to_bed.nf
@@ -26,10 +26,16 @@ process VCF_TO_BED {
shell:
output_file = vcf.getName().replace(".vcf.gz", ".bed")
- structural_variant = params.structural_variant
+ bed_fields = params.bed_fields
+ structural_variant = params.structural_variant ? "--structural-variant" : ""
'''
- vcf_to_bed !{vcf} !{output_file} !{rank_file} !{structural_variant}
+ vcf_to_bed \
+ --vcf !{vcf} \
+ --output !{output_file} \
+ --rank !{rank_file} \
+ --bed-fields !{bed_fields} \
+ !{structural_variant}
rm !{vcf}
'''
diff --git a/nextflow/vcf_prepper/modules/local/wig_to_bigwig.nf b/nextflow/vcf_prepper/modules/local/wig_to_bigwig.nf
index 601a1cad..90edb498 100755
--- a/nextflow/vcf_prepper/modules/local/wig_to_bigwig.nf
+++ b/nextflow/vcf_prepper/modules/local/wig_to_bigwig.nf
@@ -23,8 +23,8 @@ process WIG_TO_BIGWIG {
output:
path "variant-${source}-summary.bw"
- memory { (wig.size() * 10.B + 1.GB) * task.attempt }
- time { 48.hour * task.attempt }
+ memory { (wig.size() * 7.B + 1.GB) * task.attempt }
+ time { 2.hour * task.attempt }
shell:
source = meta.source.toLowerCase()
@@ -40,7 +40,7 @@ process WIG_TO_BIGWIG {
ln -sf !{output_bw} "variant-!{source}-summary.bw"
# temp: for one source we create symlink for focus if only one source present
- if [[ ! !{meta.multiple_source} ]]
+ if [[ "!{meta.multiple_source}" == "false" ]]
then
cd !{meta.genome_tracks_outdir}
ln -sf variant-!{source}-summary.bw variant-summary.bw
diff --git a/nextflow/vcf_prepper/nextflow.config b/nextflow/vcf_prepper/nextflow.config
index fa6488b5..46eb0be3 100755
--- a/nextflow/vcf_prepper/nextflow.config
+++ b/nextflow/vcf_prepper/nextflow.config
@@ -15,6 +15,7 @@ params {
rank_file = "${projectDir}/assets/variation_consequnce_rank.json"
population_data_file = null
sources_meta_file = "${projectDir}/assets/sources_meta.json"
+ bed_fields = "${projectDir}/assets/short_variant_bedfields.json"
// data directories
cache_dir = null
@@ -82,10 +83,16 @@ process {
memory = { 8.GB * task.attempt }
time = { 48.hour * task.attempt }
}
+
+ withName: 'BED_TO_BIGBED'{
+ cpus = { 1 * task.attempt }
+ memory = { 15.GB * task.attempt }
+ time = { 3.hour * task.attempt }
+ }
withName: 'runVEPonVCF'{
cpus = { 2 * task.attempt }
- memory = { 96.KB * params.bin_size * task.attempt + 1.GB }
+ memory = { 10.KB * params.bin_size + 1.GB }
time = { 0.00008.hour * params.bin_size * task.attempt + 1.hour }
}
diff --git a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.lock b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.lock
old mode 100755
new mode 100644
index a6d9ef27..a4c83dd0
--- a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.lock
+++ b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.lock
@@ -3,47 +3,155 @@
version = 3
[[package]]
-name = "adler"
-version = "1.0.2"
+name = "adler2"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "anstream"
+version = "0.6.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys",
+]
[[package]]
name = "cfg-if"
-version = "1.0.0"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
+
+[[package]]
+name = "clap"
+version = "4.5.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "crc32fast"
-version = "1.3.2"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
dependencies = [
"cfg-if",
]
[[package]]
name = "flate2"
-version = "1.0.25"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841"
+checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d"
dependencies = [
"crc32fast",
"miniz_oxide",
]
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
[[package]]
name = "itoa"
-version = "1.0.6"
+version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "memchr"
-version = "2.5.0"
+version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "minimal-lexical"
@@ -53,11 +161,11 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "miniz_oxide"
-version = "0.6.2"
+version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
- "adler",
+ "adler2",
]
[[package]]
@@ -72,56 +180,93 @@ dependencies = [
[[package]]
name = "once_cell"
-version = "1.17.1"
+version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
[[package]]
name = "proc-macro2"
-version = "1.0.51"
+version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
+checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
-version = "1.0.23"
+version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "ryu"
-version = "1.0.13"
+version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
-version = "1.0.152"
+version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
[[package]]
name = "serde_json"
-version = "1.0.94"
+version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea"
+checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
+ "memchr",
"ryu",
"serde",
+ "serde_core",
]
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
[[package]]
name = "syn"
-version = "1.0.109"
+version = "2.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
dependencies = [
"proc-macro2",
"quote",
@@ -130,18 +275,18 @@ dependencies = [
[[package]]
name = "thiserror"
-version = "1.0.38"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
@@ -150,9 +295,15 @@ dependencies = [
[[package]]
name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "vcf"
@@ -167,9 +318,90 @@ dependencies = [
[[package]]
name = "vcf_to_bed"
-version = "0.1.0"
+version = "0.2.0"
dependencies = [
+ "clap",
"flate2",
"serde_json",
"vcf",
]
+
+[[package]]
+name = "windows-link"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65"
+
+[[package]]
+name = "windows-sys"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.53.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d42b7b7f66d2a06854650af09cfdf8713e427a439c97ad65a6375318033ac4b"
+dependencies = [
+ "windows-link",
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
diff --git a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.toml b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.toml
index c7bba79c..c1ffff53 100755
--- a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.toml
+++ b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "vcf_to_bed"
-version = "0.1.0"
+version = "0.2.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -9,3 +9,4 @@ edition = "2021"
vcf="*"
flate2="*"
serde_json="*"
+clap= {version="*", features=["derive"]}
\ No newline at end of file
diff --git a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/src/main.rs b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/src/main.rs
index 8e12d234..13f64a8d 100755
--- a/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/src/main.rs
+++ b/nextflow/vcf_prepper/src/rust/ensembl/vcf_to_bed/src/main.rs
@@ -14,9 +14,25 @@
* limitations under the License.
*/
-use std::{io::{BufReader,Write}, fs::File, env, process::exit, collections::HashMap, collections::HashSet};
-use vcf::{VCFError, VCFReader};
+use std::{io::{BufReader,Write}, fs::File, process::exit, collections::HashMap, collections::HashSet};
+use vcf::{VCFError, VCFReader, VCFHeader};
use flate2::read::MultiGzDecoder;
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+#[command(version)]
+struct Args {
+ #[arg(short, long, help="VCF file to convert")]
+ vcf: String,
+ #[arg(short, long, help="Output bed file")]
+ output: String,
+ #[arg(short, long, help="JSON file with consequence ranks")]
+ rank: String,
+ #[arg(short, long, help="JSON file specifying required bed fields with metadata")]
+ bed_fields: String,
+ #[arg(short, long, help="VCF contains structural variant")]
+ structural_variant: bool
+}
const VARIANTGROUP : [(&str, u8); 45] = [
("frameshift_variant", 1),
@@ -101,6 +117,21 @@ const SV_VARIANTGROUP : [(&str, u8); 32] = [
("insertion", 13)
];
+struct WritableFields {
+ chromosome: bool,
+ start: bool,
+ end: bool,
+ id: bool,
+ variety: bool,
+ reference: bool,
+ alts: bool,
+ group: bool,
+ severity: bool,
+ severity_rank: bool,
+ extent: bool,
+ short_alt: bool
+}
+
struct Line {
chromosome: String,
start: u64,
@@ -112,7 +143,8 @@ struct Line {
group: u8,
severity: String,
severity_rank: u8,
- extent: u64
+ extent: u64,
+ short_alt: bool
}
impl Line {
@@ -129,7 +161,7 @@ impl Line {
self.variety != other.variety
}
- fn merge(&mut self, mut more: Option, out: &mut File) {
+ fn merge(&mut self, mut more: Option, out: &mut File, wfields: &WritableFields) {
// merge new line if not empty (and a Line instance)
if let Some(ref mut more) = more {
if self.compatible(more) {
@@ -156,18 +188,28 @@ impl Line {
// print out the current line
if self.alts.len() > 0 {
let alts = Vec::from_iter(self.alts.clone());
- write!(out, "{} {} {} {} {} {} {} {} {}",
- self.chromosome, self.start, self.end,
- self.id, self.variety, self.reference,
- alts.join(","), self.group, self.severity
+ write!(out, "{} {} {} {} {} {} {} {}",
+ self.chromosome,
+ self.start,
+ self.end,
+ self.id,
+ self.variety,
+ self.reference,
+ alts.join(","),
+ self.group
).unwrap();
-
- if self.extent != 0 {
- write!(out, " {}\n", self.extent).unwrap();
+ // allowing only 3 configurable fields as I don't wanna write lot
+ // of if's - need an efficient way of doing it
+ if wfields.severity {
+ write!(out, " {}", self.severity).unwrap();
}
- else {
- write!(out, "\n").unwrap();
+ if wfields.extent {
+ write!(out, " {}", self.extent).unwrap();
}
+ if wfields.short_alt {
+ write!(out, " {}", self.short_alt).unwrap();
+ }
+ write!(out, "\n").unwrap();
}
// make the new Line as the current one
@@ -177,25 +219,95 @@ impl Line {
}
}
+fn gen_csq_info_idx(header: &VCFHeader) -> HashMap<&str, usize> {
+ let csq_info_desc = match header.info("CSQ".as_bytes()) {
+ Some(csq_info) => {
+ match str::from_utf8(csq_info.description) {
+ Ok(desc) => desc,
+ Err(_) => {
+ println!("[ERROR] cannot parse header to get INFO/CSQ");
+ exit(1);
+ }
+ }
+ },
+ None => {
+ println!("[ERROR] cannot parse header to get INFO/CSQ");
+ exit(1);
+ }
+ };
+ let csq_header_idx: HashMap<&str, usize> = match csq_info_desc.split("Format: ").nth(1) {
+ Some(csq_headers) => {
+ let mut i: usize = 0;
+ let mut tmp_csq_header_idx = HashMap::new();
+ for header in csq_headers.split("|") {
+ tmp_csq_header_idx.insert(header, i);
+ i += 1;
+ }
+
+ tmp_csq_header_idx
+ },
+ None => {
+ println!("[ERROR] cannot parse header to get INFO/CSQ");
+ exit(1);
+ }
+ };
+
+ return csq_header_idx;
+}
+
fn main() -> Result<(), VCFError> {
// read cli arguments
- let args = env::args().collect::>();
+ let args = Args::parse();
let mut reader = VCFReader::new(BufReader::new(MultiGzDecoder::new(File::open(
- &args[1]
+ args.vcf
)?)))?;
- let mut out = File::create(&args[2]).unwrap();
- let json = std::fs::read_to_string(&args[3]).unwrap();
+ let mut out = File::create(args.output).unwrap();
+ let rank = std::fs::read_to_string(args.rank).unwrap();
+ let bed_fields = std::fs::read_to_string(args.bed_fields).unwrap();
+ let is_sv = args.structural_variant;
- let mut is_sv = false;
- if args.len() == 5 && String::from(&args[4]) == "1" {
- is_sv = true;
- }
-
- let severity = {
- serde_json::from_str::>(&json).unwrap()
+ // generate Hashmap of INFO/CSQ header indexes and retrieve relevant header indexes
+ let header = reader.header();
+ let csq_header_idx: HashMap<&str, usize> = gen_csq_info_idx(&header);
+ let &csqh_conseq_idx = match csq_header_idx.get("Consequence") {
+ Some(idx) => idx,
+ None => {
+ println!("[ERROR] could not determine index of 'Consequence' field from INFO/CSQ");
+ exit(1);
+ }
+ };
+ let &csqh_variant_class_idx = match csq_header_idx.get("VARIANT_CLASS") {
+ Some(idx) => idx,
+ None => {
+ println!("[ERROR] could not determine index of 'VARIANT_CLASS' field from INFO/CSQ");
+ exit(1);
+ }
};
- // create the severity hash
+ // load bed fields json and map it to writable fields
+ let bfields = serde_json::from_str::(&bed_fields).unwrap();
+ let fields = bfields.get("fields").unwrap();
+ let wfields = WritableFields {
+ chromosome: fields.get("chromosome").is_some(),
+ start: fields.get("start").is_some(),
+ end: fields.get("end").is_some(),
+ id: fields.get("id").is_some(),
+ variety: fields.get("variety").is_some(),
+ reference: fields.get("reference").is_some(),
+ alts: fields.get("alts").is_some(),
+ group: fields.get("group").is_some(),
+ severity: fields.get("severity").is_some(),
+ severity_rank: fields.get("severity_rank").is_some(),
+ extent: fields.get("extent").is_some(),
+ short_alt: fields.get("short_alt").is_some()
+ };
+
+ // load consequence ranks hash
+ let csq_ranks = {
+ serde_json::from_str::>(&rank).unwrap()
+ };
+
+ // load variant group hash
let mut variant_groups = HashMap::new();
if is_sv {
for (csq, value) in &SV_VARIANTGROUP {
@@ -222,12 +334,11 @@ fn main() -> Result<(), VCFError> {
group: 0,
severity: "".to_string(),
severity_rank: 255,
- extent: 0
+ extent: 0,
+ short_alt: false
};
while reader.next_record(&mut record)? {
- let reference = String::from_utf8(record.reference.clone()).unwrap();
- let ref_len = reference.len() as u64;
-
+ // read id:
let mut multiple_ids = false;
let ids = record.id.iter().map(|b| {
String::from_utf8(b.clone())
@@ -243,19 +354,24 @@ fn main() -> Result<(), VCFError> {
);
continue;
}
+
+ // read ref allele:
+ let reference = String::from_utf8(record.reference.clone()).unwrap();
+ let ref_len = reference.len() as u64;
+ // read alt alleles:
let alts = record.alternative.iter().map(|a| {
String::from_utf8(a.clone())
}).collect::,_>>().unwrap();
- // TBD: replace hardcoded index value - .nth(1)
+ // read INFO/CSQ/Consequence:
let csq = record.info(b"CSQ").map(|csqs| {
csqs.iter().map(|csq| {
let s = String::from_utf8_lossy(csq);
- s.split("|").nth(1).unwrap_or("").to_string()
+ s.split("|").nth(csqh_conseq_idx).unwrap_or("").to_string()
}).collect::>()
}).unwrap_or(vec![]);
- // if csq is empty we won't have most severe consequence
+ // SKIP if csq is empty
if csq.is_empty(){
println!("[WARNING] INFO/CSQ empty - {0}:{1}, skipping",
String::from_utf8(record.chromosome.to_vec()).unwrap(),
@@ -264,21 +380,22 @@ fn main() -> Result<(), VCFError> {
continue;
}
- // TBD: replace hardcoded index value - .nth(21)
+ // INFO/CSQ/VARIANT_CLASS
let class = record.info(b"CSQ").map(|csqs| {
csqs.iter().map(|csq| {
let s = String::from_utf8_lossy(csq);
- s.split("|").nth(21).unwrap_or("").to_string()
+ s.split("|").nth(csqh_variant_class_idx).unwrap_or("").to_string()
}).collect::>()
}).unwrap_or(vec![]);
- // check for SV
+ // check for symbolic alt
let mut symbolic_alts = false;
+ let mut max_alt_len = 0;
for alt in alts.iter() {
- if alt.chars().any(|c| (
- c != 'A' && c != 'T' && c != 'C' && c != 'G' && c != 'N'
- && c != 'a' && c != 't' && c != 'c' && c != 'g' && c != 'n'
- )) {
+ if alt.chars().any(|c|
+ c != 'A' && c != 'T' && c != 'C' && c != 'G' && c != 'N' &&
+ c != 'a' && c != 't' && c != 'c' && c != 'g' && c != 'n'
+ ) {
symbolic_alts = true;
}
if !is_sv && symbolic_alts {
@@ -286,6 +403,11 @@ fn main() -> Result<(), VCFError> {
If you are running this script for structural variant please re-run setting the 4th argument field to 1.");
exit(1)
}
+
+ let alt_len = alt.len();
+ if alt_len > max_alt_len {
+ max_alt_len = alt_len;
+ }
}
for id in ids.iter() {
@@ -293,10 +415,12 @@ fn main() -> Result<(), VCFError> {
let mut most_severe_csq = "";
let mut most_severe_csq_rank = 255;
- // calculate most severe consequence and variant group of that consequence
+ /*
+ * calculate most severe consequence and variant group of that consequence
+ */
for csq_str in csq.iter() {
for csq_here in csq_str.split("&") {
- let csq_rank_here = (*severity.get(csq_here).unwrap_or(&String::from("0"))).parse::().unwrap();
+ let csq_rank_here = (*csq_ranks.get(csq_here).unwrap_or(&String::from("0"))).parse::().unwrap();
if csq_rank_here < most_severe_csq_rank {
variant_group = *variant_groups.get(csq_here).unwrap_or(&0);
most_severe_csq = csq_here;
@@ -305,7 +429,9 @@ fn main() -> Result<(), VCFError> {
}
}
- // calcualte variant class - we store it as variety
+ /*
+ * calculate variant class - we store it as variety
+ */
// variety should always be same for each variant allele - VEP puts variant class at variant level (using Bio::EnsEMBL::Variation::Utils::Sequence::SO_variation_class)
// if cannot be deduced the default value is - sequence_alteration
let mut variety = class[0].to_string();
@@ -344,6 +470,9 @@ fn main() -> Result<(), VCFError> {
}
}
+ /*
+ * calculate positions
+ */
// start position in bed is 0-indexed
let mut start = record.position - 1;
@@ -354,12 +483,22 @@ fn main() -> Result<(), VCFError> {
end = start;
}
- // for short variants we do not use extent
+ /*
+ * particular calculation for SV
+ * - update variant group and position for SVs
+ * - if non-symbolic set is_sv = false for alt bp < 50
+ * - calculate extent
+ */
let mut extent = 0;
-
+ let mut short_alt = false;
+
if is_sv {
// overwrite variant group to come from variant class instead of consequence
variant_group = *variant_groups.get(&variety).unwrap_or(&0);
+
+ if !symbolic_alts && max_alt_len < 50 {
+ short_alt = true;
+ }
// check INFO/SVLEN or INFO/END to get the end position
let svlens = record.info(b"SVLEN").map(|svlen| {
@@ -429,13 +568,14 @@ fn main() -> Result<(), VCFError> {
group: variant_group,
severity: most_severe_csq.to_string(),
severity_rank: most_severe_csq_rank,
- extent: extent
+ extent: extent,
+ short_alt: short_alt
};
- lines.merge(Some(more), &mut out);
+ lines.merge(Some(more), &mut out, &wfields);
}
}
- lines.merge(None, &mut out);
+ lines.merge(None, &mut out, &wfields);
Ok(())
}
\ No newline at end of file
diff --git a/nextflow/vcf_prepper/workflows/vcf_prepper.nf b/nextflow/vcf_prepper/workflows/vcf_prepper.nf
index f725b64c..fa0aaac2 100755
--- a/nextflow/vcf_prepper/workflows/vcf_prepper.nf
+++ b/nextflow/vcf_prepper/workflows/vcf_prepper.nf
@@ -64,7 +64,7 @@ def parse_config (config) {
// if source is MULTIPLE there are multiple sources; they must be listed in sources field in the input config
if (meta.source == "MULTIPLE"){
- meta.sources = source_data.sources.join(",")
+ meta.sources = source_datum.sources.join(",")
meta.sources = meta.sources.replaceAll(" ", "%20") // we cannot use whitespace in cmd argument
}
diff --git a/scripts/create_hprc_segdup_tracks.py b/scripts/create_hprc_segdup_tracks.py
new file mode 100644
index 00000000..5c6fc7e3
--- /dev/null
+++ b/scripts/create_hprc_segdup_tracks.py
@@ -0,0 +1,541 @@
+#!/usr/bin/env python3
+
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import sys
+import os
+import subprocess
+import configparser
+
+def parse_args(args=None):
+ """Parse command-line arguments for create_input_config.
+
+ Args:
+ args (list|None): Optional argument list for testing.
+
+ Returns:
+ argparse.Namespace: Parsed arguments
+ """
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ dest="input_bed",
+ type=str,
+ help="path to the bed file containing segdups, expect output from SEDEF",
+ )
+ parser.add_argument(
+ "--output_prefix",
+ dest="output_prefix",
+ type=str,
+ help="output file name prefix, all files will be named - <output_prefix>.<extension>",
+ )
+ parser.add_argument(
+ "--extra_fields",
+ dest="extra_fields",
+ action="store_true",
+ help="add extra fields in the bed file"
+ )
+ parser.add_argument(
+ "--chrom_sizes_file",
+ dest="chrom_sizes_file",
+ type=str,
+ required=False,
+ help="file with chromosome sizes, if not given the script will generate it.",
+ )
+ parser.add_argument(
+ "--synonym_file",
+ dest="synonym_file",
+ type=str,
+ required=False,
+ help="file with chromosome synonym, if not given the script will generate it.",
+ )
+ parser.add_argument(
+ "-I",
+ "--ini_file",
+ dest="ini_file",
+ type=str,
+ required=False,
+ help="full path database configuration file, default - DEFAULT.ini in the same directory.",
+ )
+ parser.add_argument("--species", dest="species", type=str, help="species production name")
+ parser.add_argument("--assembly", dest="assembly", type=str, help="assembly default")
+
+ return parser.parse_args(args)
+
+def parse_ini(ini_file: str, section: str = "database") -> dict:
+ """
+ Load connection parameters from an INI file.
+
+ Raises:
+ SystemExit: If the requested *section* is absent from
+ the file (an error message is printed before exit).
+
+ Returns:
+ dict: Keys `host`, `port`, `user` – suitable for passing
+ straight to the MySQL command-line client.
+ """
+
+ config = configparser.ConfigParser()
+ config.read(ini_file)
+
+ if not section in config:
+ print(f"[ERROR] Could not find '{section}' config in ini file - {ini_file}")
+ exit(1)
+ else:
+ host = config[section]["host"]
+ port = config[section]["port"]
+ user = config[section]["user"]
+
+ return {"host": host, "port": port, "user": user}
+
+
+def get_db_name(server: dict, species: str, type: str) -> str:
+ """Return the first database name matching the species on the server.
+
+ Queries the MySQL server for databases matching the pattern and returns the
+ first match. A warning is printed if multiple matches are found.
+
+ Args:
+ server (dict): Server connection mapping with keys 'host', 'port', 'user'.
+ species (str): Species production name.
+ type (str): Database type (e.g. 'core').
+
+ Returns:
+ str: The first matching database name.
+ """
+ query = f"SHOW DATABASES LIKE '{species}_{type}%';"
+ process = subprocess.run(
+ [
+ "mysql",
+ "--host",
+ server["host"],
+ "--port",
+ server["port"],
+ "--user",
+ server["user"],
+ "-N",
+ "--execute",
+ query,
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ results = process.stdout.decode().strip().split("\n")
+ if len(results) > 1:
+ print(
+ f"[WARNING] Multiple {type} database found - returning the first match only"
+ )
+
+ return results[0]
+
+def generate_chrom_sizes_file(
+ server: dict,
+ core_db: str,
+ chrom_sizes_file: str,
+ assembly: str,
+) -> None:
+ """Generate a chromosome sizes file from the core database.
+
+ Writes seq_region lengths and synonym lengths with deduplication.
+
+ Args:
+ server (dict): Server connection mapping.
+ core_db (str): Core database name.
+ chrom_sizes_file (str): Output chrom sizes filename.
+ assembly (str): Assembly identifier used to filter coord_system.
+
+ Returns:
+ None
+ """
+ query = f"SELECT coord_system_id FROM coord_system WHERE version = '{assembly}';"
+ process = subprocess.run(
+ [
+ "mysql",
+ "--host",
+ server["host"],
+ "--port",
+ server["port"],
+ "--user",
+ server["user"],
+ "--database",
+ core_db,
+ "-N",
+ "--execute",
+ query,
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ coord_ids = (
+ "(" + ",".join([id for id in process.stdout.decode().strip().split("\n")]) + ")"
+ )
+
+ query = f"SELECT name, length FROM seq_region WHERE coord_system_id IN {coord_ids};"
+ process = subprocess.run(
+ [
+ "mysql",
+ "--host",
+ server["host"],
+ "--port",
+ server["port"],
+ "--user",
+ server["user"],
+ "--database",
+ core_db,
+ "-N",
+ "--execute",
+ query,
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ with open(chrom_sizes_file, "w") as file:
+ file.write(process.stdout.decode())
+
+ query = f"SELECT ss.synonym, s.length FROM seq_region AS s, seq_region_synonym AS ss WHERE s.seq_region_id = ss.seq_region_id AND s.coord_system_id IN {coord_ids};"
+ process = subprocess.run(
+ [
+ "mysql",
+ "--host",
+ server["host"],
+ "--port",
+ server["port"],
+ "--user",
+ server["user"],
+ "--database",
+ core_db,
+ "-N",
+ "--execute",
+ query,
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ with open(chrom_sizes_file, "a") as file:
+ file.write(process.stdout.decode().strip())
+
+ # remove duplicates
+ with open(chrom_sizes_file, "r") as file:
+ lines = file.readlines()
+
+ lengths = {}
+ for line in lines:
+ name, length = [col.strip() for col in line.split("\t")]
+ if name not in lengths or int(lengths[name]) < int(length):
+ lengths[name] = length
+
+ with open(chrom_sizes_file, "w") as file:
+ for name in lengths:
+ # we will keep length + 1 because bedToBigBed fails if it finds variant at boundary
+ length = int(lengths[name]) + 1
+ file.write(f"{name}\t{str(length)}\n")
+
+def generate_synonym_file(
+ server: dict, core_db: str, synonym_file: str
+) -> None:
+ """Generate a chromosome synonym file from the core database.
+
+ Queries seq_region and seq_region_synonym tables, post-processes to remove duplicates
+ and resolves names longer than 31 characters where possible.
+
+ Args:
+ server (dict): Server connection mapping.
+ core_db (str): Core database name.
+ synonym_file (str): Output file path.
+
+ Returns:
+ None
+ """
+ query = f"SELECT ss.synonym, sr.name FROM seq_region AS sr, seq_region_synonym AS ss WHERE sr.seq_region_id = ss.seq_region_id;"
+ process = subprocess.run(
+ [
+ "mysql",
+ "--host",
+ server["host"],
+ "--port",
+ server["port"],
+ "--user",
+ server["user"],
+ "--database",
+ core_db,
+ "-N",
+ "--execute",
+ query,
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+
+ with open(synonym_file, "w") as file:
+ file.write(process.stdout.decode().strip())
+
+ # remove duplicates and change seq region name that are longer than 31 character
+ with open(synonym_file, "r") as file:
+ lines = file.readlines()
+
+ names = {}
+ for line in lines:
+ synonym, name = [col.strip() for col in line.split("\t")]
+ if synonym not in names or len(names[synonym]) > len(name):
+ names[synonym] = name
+
+ new_names = {}
+ for synonym in names:
+ name = names[synonym]
+
+ # add entries for chr prefixed chromosome name from UCSC cases
+ if name.isdigit() or name in ["X", "Y", "MT"]:
+ new_names[f"chr{name}"] = name
+
+ # if name is longer than 31 character we try to take a synonym instead of the name
+ if len(name) > 31:
+ # if the current synonym is less than 31 character we can take it
+ # and it does not need to be in the file
+ if len(synonym) <= 31:
+ pass
+ # if the current synonym is longer than 31 character we look for another synonym of the name
+ else:
+ change_name = synonym
+ for alt_synonym in names:
+ if names[alt_synonym] == synonym and len(alt_synonym) < 31:
+ change_name = alt_synonym
+
+ if len(change_name) > 31:
+ print(
+ f"[WARNING] cannot resolve {name} to a synonym which is under 31 character"
+ )
+
+ new_names[synonym] = change_name
+ else:
+ new_names[synonym] = name
+
+ with open(synonym_file, "w") as file:
+ for synonym in new_names:
+ file.write(f"{synonym}\t{new_names[synonym]}\n")
+
+def main(args=None):
+ """Construct a bigBed and bigWig that can be used in Ensembl new website
+
+ Args:
+ args (list|None): Optional argument list for testing.
+
+ Returns:
+ None
+ """
+ args = parse_args(args)
+
+ input_bed = args.input_bed
+ output_prefix = args.output_prefix or os.path.basename(input_bed).replace(".bed", "_out")
+ extra_fields = args.extra_fields
+ chrom_sizes_file = args.chrom_sizes_file
+ synonym_file = args.synonym_file
+ species = args.species
+ assembly = args.assembly
+ ini_file = args.ini_file or "DEFAULT.ini"
+ output_bed = f"{output_prefix}.bed"
+ autosql = f"{output_prefix}.as"
+ output_bb = f"{output_prefix}.bb"
+ output_bedgraph = f"{output_prefix}.bedgraph"
+ output_bw = f"{output_prefix}.bw"
+
+ # generate chrom sizes file if not provided
+ if (not chrom_sizes_file) or (not os.path.isfile(chrom_sizes_file)):
+ print("No chrom sizes file provided, generating...")
+
+ if (not species) or (not assembly) or (not os.path.isfile(ini_file)):
+ print("[ERROR] Need --species, --assembly and --ini_file to generate chrom sizes file")
+ exit(1)
+
+ chrom_sizes_file = f"{species}.chrom_sizes"
+ core_server = parse_ini(ini_file, "core")
+ core_db = get_db_name(core_server, species, type="core")
+ generate_chrom_sizes_file(core_server, core_db, chrom_sizes_file, assembly)
+
+ chrom_sizes = {}
+ with open(chrom_sizes_file) as f:
+ for line in f:
+ [chrom, length] = line.strip().split("\t")
+ chrom_sizes[chrom] = length
+
+ # generate synonym file if not provided
+ if (not synonym_file) or (not os.path.isfile(synonym_file)):
+ print("No synonym file provided, generating...")
+
+ if (not species) or (not os.path.isfile(ini_file)):
+ print("[ERROR] Need --species and --ini_file to generate synonym file")
+ exit(1)
+
+ synonym_file = f"{species}.synonym"
+ generate_synonym_file(core_server, core_db, synonym_file)
+
+ synonyms = {}
+ with open(synonym_file) as file:
+ for line in file:
+ chr = line.split("\t")[0].strip()
+ synonym = line.split("\t")[1].strip()
+
+ synonyms[chr] = synonym
+
+ # format input bed to be ingestable by the web client
+ print("Generating bed ...")
+ header = []
+ avail_chrom = chrom_sizes.keys()
+ skipped_count = 0
+ skipped_chrom = set()
+ added_chrom = set()
+ with open(input_bed) as f, open(output_bed, "w") as w_f:
+ for line in f:
+ if line.startswith("#"):
+ header = line.strip().replace("#", "").split("\t")
+ continue
+
+ bed_fields = {}
+ for col_no, col_value in enumerate(line.strip().split("\t")):
+ # some cases there were duplicate start1, end1 fields containing invalid location
+ # not overriding the actual start1, end1
+ if not header[col_no] in bed_fields:
+ bed_fields[header[col_no]] = col_value
+
+ # remove chr prefix and rename M to MT; some have haplotype#version# prefix
+ bed_fields['chr1'] = bed_fields['chr1'].split("#")[-1]
+ if bed_fields['chr1'].startswith("chr"):
+ bed_fields['chr1'] = bed_fields['chr1'][3:]
+ if bed_fields['chr1'] == "M":
+ bed_fields['chr1'] = "MT"
+
+ # replace synonyms
+ bed_fields['chr1'] = synonyms.get(bed_fields['chr1'], bed_fields['chr1'])
+
+ if bed_fields['chr1'] not in avail_chrom:
+ skipped_chrom.add(bed_fields['chr1'])
+ skipped_count += 1
+ continue
+ added_chrom.add(bed_fields['chr1'])
+
+ segdup_type = []
+ if bed_fields['telo'] == '1':
+ segdup_type.append("Telomeric")
+ elif bed_fields['peri'] == '1':
+ segdup_type.append("Pericentromeric")
+ elif bed_fields['acro'] == '1':
+ segdup_type.append("Acrocentric")
+
+ new_bed_fields = {
+ "chrom": bed_fields['chr1'],
+ "chromStart": bed_fields['start1'],
+ "chromEnd": bed_fields['end1'],
+ "strand": bed_fields['strand1'],
+ "analysis": "SEDEF",
+ "repeatName": "segdup",
+ "repeatClass": "segmental_duplication",
+ "repeatType": "/".join(segdup_type)
+ }
+ if extra_fields:
+ new_bed_fields["percentage_match"] = bed_fields['fracMatch']
+ new_bed_fields["duplicated_region"] = bed_fields['name']
+ new_bed_fields["duplicated_region_strand"] = bed_fields['strand2']
+
+ w_f.write("\t".join(new_bed_fields.values()) + "\n")
+ process = subprocess.run(
+ ["sort", "-k1,1", "-k2,2n", "-k3,3n", "-o", output_bed, output_bed],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.returncode != 0:
+ print(f"[ERROR] Could not sort bed - {process.stderr.decode()}")
+ exit(1)
+
+ print(f"Skipped {skipped_count} entries from chromosomes - {','.join(skipped_chrom)}")
+
+ # create AutoSQL for bigBed
+ AutoSQL = ['table repeats',
+ '"Repeat feature data."',
+ "\t(",
+ '\tstring chrom; "Region name (Chromosome or contig, scaffold, etc.)"',
+ '\tuint chromStart; "Start position in region"',
+ '\tuint chromEnd; "End position in region"\n',
+ '\tchar[1] strand; "Strand as +/-/. (forward, reverse, undefined)"',
+ '\tstring analysis; "Type of repeat analysis"',
+ '\tstring repeatName; "Name of repeat"',
+ '\tstring repeatClass; "Class of repeat"',
+ '\tstring repeatType; "Type of repeat"',
+ "\t)",
+ ]
+ if extra_fields:
+ AutoSQL.pop()
+ AutoSQL.append('\tstring match; "Match percentage with duplicated region"')
+ AutoSQL.append('\tstring duplicated_region; "Duplicated region (formatted as chrom:start-end)"')
+ AutoSQL.append('\tstring duplicated_region_strand; "Strand of duplicated region"')
+ AutoSQL.append('\t)')
+
+ with open(autosql, "w") as w_f:
+ for line in AutoSQL:
+ w_f.write(line + "\n")
+
+ # generate bigBed
+ print("Generating bigBed ...")
+ type_arg = "-type=bed3+5"
+ if extra_fields:
+ type_arg = "-type=bed3+8"
+ process = subprocess.run(
+ ["bedToBigBed", "-tab", type_arg, f"-as={autosql}", output_bed, chrom_sizes_file, output_bb],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.returncode != 0:
+ print(f"[ERROR] Could not create bigBed - {process.stderr.decode()}")
+ exit(1)
+
+ # generate bigWig
+ print("Generating bigWig ...")
+ with open(output_bed) as f, open(output_bedgraph, "w") as w_f:
+ current_chrom = ""
+ pos = 1
+ for line in f:
+ (chrom, start, end) = line.strip().split("\t")[0:3]
+ # bed is 0-indexed and wiggle is 1-indexed, so we need to add 1 here
+ start = int(start) + 1
+ # bed end is 0-based exclusive, which equals the 1-based inclusive end, so no need to add 1 here
+ end = int(end)
+
+ if chrom != current_chrom:
+ current_chrom = chrom
+ pos = 1
+
+ if pos < start:
+ w_f.write(f"{current_chrom}\t{pos}\t{start-1}\t0.0\n")
+ w_f.write(f"{current_chrom}\t{start}\t{end}\t1.0\n")
+ elif pos < end:
+ w_f.write(f"{current_chrom}\t{pos}\t{end}\t1.0\n")
+ else:
+ continue
+ pos = end+1
+
+ process = subprocess.run(
+ ["bedGraphToBigWig", output_bedgraph, chrom_sizes_file, output_bw],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.returncode != 0:
+ print(f"[ERROR] Could not create bigWig - {process.stderr.decode()}")
+ exit(1)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/create_metadata_payload.py b/scripts/create_metadata_payload.py
index e155ad1f..85cda1f2 100755
--- a/scripts/create_metadata_payload.py
+++ b/scripts/create_metadata_payload.py
@@ -143,12 +143,13 @@ def get_variant_example(file: str, species: str) -> str:
# if human, try to find rs699 in 400kbp range
if species.startswith("homo_sapiens"):
- for variant in vcf("1:230500000-230900000"):
- if variant.ID == "rs699":
- chrom = variant.CHROM
- pos = variant.POS
- id = variant.ID
- return f"{chrom}:{pos}:{id}"
+ if '1' in vcf.seqnames:
+ for variant in vcf("1:230500000-230900000"):
+ if variant.ID == "rs699":
+ chrom = variant.CHROM
+ pos = variant.POS
+ id = variant.ID
+ return f"{chrom}:{pos}:{id}"
# find a missense_variant
for variant in vcf:
diff --git a/scripts/create_track_api_metadata.py b/scripts/create_track_api_metadata.py
index 5f669071..f66509b5 100755
--- a/scripts/create_track_api_metadata.py
+++ b/scripts/create_track_api_metadata.py
@@ -150,7 +150,7 @@ def get_source_desc_prefix(source: str, source_version: str) -> str:
else:
desc_prefix = f" from {source}"
if source_version is not None:
- desc_prefix += f"- {source_version}"
+ desc_prefix += f" - {source_version}"
return desc_prefix
diff --git a/scripts/merge_region_call.py b/scripts/merge_region_call.py
new file mode 100644
index 00000000..67a0ed92
--- /dev/null
+++ b/scripts/merge_region_call.py
@@ -0,0 +1,129 @@
+import click
+import gzip
+import pysam
+import re
+from collections import defaultdict
+
+from lxml import etree
+import xml.etree.ElementTree as ET
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+acession_copy_number_map = {}
+
+
+def find_variant_by_accession(xml_file, accession) -> dict|None:
+ if not acession_copy_number_map:
+ tree = etree.parse(xml_file)
+ root = tree.getroot()
+ for vc in root.findall("VARIANT_CALL"):
+ accession_from_file = vc.get("variant_call_accession")
+ copy_num = vc.get("copy_number")
+ acession_copy_number_map[accession_from_file] = copy_num
+ if accession in acession_copy_number_map:
+ return {
+ "variant_call_accession": accession,
+ "copy_number": acession_copy_number_map[accession]
+ }
+
+
+
+def get_calls_by_region(call_vcf, call_xml_path) -> dict:
+ region_calls = defaultdict(list)
+ for rec in call_vcf:
+ parent_id = (rec.info.get("REGIONID") or [None])[0]
+ call_record = {}
+ if parent_id:
+ xml_record = find_variant_by_accession(call_xml_path, rec.id)
+ if xml_record:
+ call_record["COPY_NUMBER"] = xml_record["copy_number"]
+ call_record["CHROM"] = rec.chrom
+ call_record["POS"] = rec.pos
+ call_record["REF"] = rec.ref
+ if len(rec.alts) != 1:
+ logging.warning(f"Not bi-allelic variant call - {rec.id or f'{rec.chrom}:{rec.pos}'}")
+ call_record["ALT"] = rec.alts[0] if rec.alts else None
+ call_record["ALLELE_NAME"] = rec.id or f"{rec.chrom}:{rec.pos}"
+ call_record["ALLELE_TYPE"] = rec.info.get("SVTYPE")
+ call_record["END"] = rec.stop
+ call_record["SVLEN"] = rec.info.get("SVLEN")
+ region_calls[parent_id].append(call_record)
+ return region_calls
+
+
+# Add new INFO fields to header
+def get_output_header(header: pysam.VariantHeader)-> pysam.VariantHeader:
+ header.info.add("ALLELE_NAME", ".", "String", "Comma-separated list of supporting call IDs")
+ header.info.add("ALLELE_TYPE", 1, "String", "Aggregated type of supporting calls")
+ header.info.add("CN", ".", "String", "Comma-separated list of copy numbers of supporting calls")
+ if "SVLEN" not in header.info:
+ header.info.add("SVLEN", ".", "Integer", "List of SV lengths of supporting calls")
+ return header
+
+def aggregate_sv_type(sv_types)-> str:
+ sv_types = set(sv_types)
+ if len(sv_types) == 1:
+ return sv_types.pop()
+ elif "DUP" in sv_types and "INS" in sv_types and len(sv_types) == 2:
+ return "DUP/INS"
+ elif "DEL" in sv_types and "INS" in sv_types and len(sv_types) == 2:
+ return "DEL/INS"
+ else:
+ return "COMPLEX"
+
+# to validate calls for CHROM, POS, REF
+def validate_calls(region_rec, calls) -> bool:
+ for field in (("CHROM","chrom"), ("POS","pos"), ("REF","ref")):
+ if len(set([call[field[0]] for call in calls])) != 1 or set([call[field[0]] for call in calls]).pop() != getattr(region_rec,field[1]):
+ logging.warning(f"{field[0]} does not match for region and call files")
+ return False
+ return True
+
+def generate_output(region_vcf, region_calls, out_vcf_path, header) -> None:
+ # Output VCF
+ out_vcf = pysam.VariantFile(out_vcf_path, "w", header=header)
+ for rec in region_vcf:
+ new_rec = out_vcf.new_record(contig=rec.contig, start=rec.start, stop=rec.stop,
+ alleles=rec.alleles, id=rec.id,
+ qual=None, filter=None, info=rec.info
+ )
+ calls = region_calls.get(rec.id, [])
+
+ if calls:
+ new_rec.info["ALLELE_NAME"] = ",".join(call["ALLELE_NAME"] for call in calls)
+ new_rec.info["ALLELE_TYPE"] = aggregate_sv_type(call["ALLELE_TYPE"] for call in calls)
+ new_rec.alts = tuple([call["ALT"] for call in calls])
+ new_rec.info["SVLEN"] = ",".join(str(call["SVLEN"][0]) if call["SVLEN"] is not None else "." for call in calls)
+ if any(call.get("COPY_NUMBER") is not None for call in calls):
+ new_rec.info["CN"] = ",".join(call.get("COPY_NUMBER", "NA") for call in calls)
+ calls_max_stop = max(call["END"] for call in calls)
+ if rec.stop != calls_max_stop:
+ logging.warning(f"END of calls does not match with region ({rec.stop} vs. {calls_max_stop}) for variant region - {rec.id}:{rec.contig}:{rec.start}")
+ else:
+ new_rec.info["ALLELE_NAME"] = None
+ new_rec.info["ALLELE_TYPE"] = rec.info.get("SVTYPE")
+ out_vcf.write(new_rec)
+
+ out_vcf.close()
+
+@click.command()
+@click.argument('region_vcf_path', type=click.Path(exists=True))
+@click.argument('call_vcf_path', type=click.Path(exists=True))
+@click.argument('call_xml_path', type=click.Path(exists=True))
+@click.argument('output_vcf_path', type=click.Path())
+def main(region_vcf_path, call_vcf_path, call_xml_path, output_vcf_path) -> None:
+ # Step 1: Load call VCF and map calls to regions
+ call_vcf = pysam.VariantFile(call_vcf_path)
+ calls = get_calls_by_region(call_vcf, call_xml_path)
+
+ # Step 2: Load region VCF and add header fields
+ region_vcf = pysam.VariantFile(region_vcf_path)
+ header = region_vcf.header.copy()
+
+ # Step 3: Generate output VCF
+ output_header = get_output_header(header)
+ generate_output(region_vcf, calls, output_vcf_path, output_header)
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/population_to_haplotype.py b/scripts/population_to_haplotype.py
index e08bf2eb..55833dac 100644
--- a/scripts/population_to_haplotype.py
+++ b/scripts/population_to_haplotype.py
@@ -18,7 +18,10 @@
import os
import argparse
import subprocess
-from enum import Enum
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
parser = argparse.ArgumentParser()
parser.add_argument(dest="input", type=str, help="Population VCF file path")
@@ -32,6 +35,12 @@
action="store_true",
help="Split the output file into short and structural variant",
)
+parser.add_argument(
+ "--update_id",
+ dest="update_id",
+ action="store_true",
+ help="Update variant identifier to be a modified SPDI notation",
+)
parser.add_argument("--debug", dest="debug", action="store_true", help="Debug mode")
args = parser.parse_args()
@@ -47,6 +56,8 @@ def get_sample_genotype_prefix(sample):
"""
if sample == "GRCh38":
return {0: "GCA_000001405.29"}
+ elif sample == "CHM13":
+ return {0: "GCA_009914755.4"}
else:
return {0: "paternal", 1: "maternal"}
@@ -100,7 +111,7 @@ def bgzip_file(file: str) -> bool:
if not os.path.isfile(file):
raise FileNotFoundError(f"File not found - {file}")
- process = subprocess.run(["bgzip", file])
+ process = subprocess.run(["bgzip", "-f", file])
if process.returncode != 0:
raise Exception("Failed to bgzip - {file}")
@@ -128,10 +139,19 @@ def index_file(file: str) -> bool:
STRUCTURAL_VARIANT_LEN = 50
sample = args.sample
+
input_vcf = VCF(args.input, samples=[sample])
+if args.update_id:
+ input_vcf.add_info_to_header({
+ 'ID': 'NODEID',
+ 'Description': 'Identifier of the nodes this variant belong to',
+ 'Type':'String',
+ 'Number': '1'
+ })
+
out_dir = (
args.output_dir
- or "/nfs/production/flicek/ensembl/variation/new_website/SV/process_hgvs3/outputs/"
+ or os.getcwd()
)
filenames = generate_filenames(sample, args.split_sv)
writers = {}
@@ -148,23 +168,76 @@ def index_file(file: str) -> bool:
else:
writers["structural_variant"] = writers["short_variant"]
+
sample_idx = input_vcf.samples.index(sample)
counter = 100
+cache = {
+ "location": ""
+}
for variant in input_vcf:
- # id_tag = variant.INFO.get("ID")
- # var_type = id_tag.split("-")[2]
+ orig_alt = variant.ALT
+ orig_id = variant.ID
max_allele_len = max([len(allele) for allele in variant.ALT + [variant.REF]])
type = "short_variant"
if max_allele_len > STRUCTURAL_VARIANT_LEN:
type = "structural_variant"
-
+
genotype = variant.genotypes[sample_idx]
for gt_idx, gt in enumerate(genotype[:-1]):
- if gt:
+ if gt > 0:
+ ref = variant.REF
+ alt = variant.ALT[gt-1] # gt=0 is ref
+ location = f"{variant.CHROM}:{variant.POS}"
+ allele_string = f"{ref.upper()}/{alt.upper()}"
+
+ # remove anchor and calculate modified spdi
+ if ref[0] == alt[0]:
+ ref = ref[1:]
+ alt = alt[1:]
+ deleted_length = len(ref) if len(ref) else ""
+ inserted_length = len(alt) if len(alt) else ""
+ mod_spdi = f"{variant.CHROM}:{variant.POS}:{deleted_length}:{inserted_length}"
+
+ # for new location initialise the lookup cache
+ if cache["location"] != location:
+ cache = {
+ "location": location
+ }
+ cache[mod_spdi] = cache.get(mod_spdi, {})
+
+ # checking variants with same SPDI
+ if gt_idx in cache[mod_spdi]:
+
+ # if same SPDI have different allele we need to check why that is so
+ if allele_string != cache[mod_spdi][gt_idx]:
+ logger.warning("Same SPDI with different allele string")
+ logger.warning(f"\tlocation - {location}; spdi - {mod_spdi}")
+ logger.warning(f"\tallele_string 1: {cache[mod_spdi][gt_idx]}")
+ logger.warning(f"\tallele_string 2: {allele_string}")
+
+ # same SPDI with same allele string is basically same variant
+ # it sometimes happen when the different path in graph gives same variant
+ # take only one and ignore the others
+ else:
+ logger.info(f"Duplicate variant - {mod_spdi}, only one will be taken")
+ break
+
+ cache[mod_spdi][gt_idx] = allele_string
+
+ # update variant and store
+ if args.update_id:
+ variant.INFO["NODEID"] = orig_id
+ variant.ID = mod_spdi
+ variant.ALT = [variant.ALT[gt-1]]
+
writer = writers[type][gt_idx]
writer.write_record(variant)
+ # restore the values for next gt
+ variant.ID = orig_id
+ variant.ALT = orig_alt
+
if args.debug and not counter:
break
counter -= 1
diff --git a/scripts/test_handover.py b/scripts/test_handover.py
new file mode 100644
index 00000000..277f09b4
--- /dev/null
+++ b/scripts/test_handover.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import requests
+import sys
+import os
+import json
+import argparse
+from glob import glob
+
+SITE_URL = {
+ "live": "https://beta.ensembl.org",
+ "staging": "https://staging-2020.ensembl.org"
+}
+
+def parser_args(args):
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--payload_json", dest="payload_json", help="path to handover payload json file - either for api or tracks")
+ parser.add_argument("--site", dest="site", choices=["live", "staging"], default="live")
+ parser.add_argument("--type", dest="type", choices=["tracks", "api"], default="tracks")
+ parser.add_argument("--quiet", dest="quiet", action="store_true", help="Only report errors")
+
+ return parser.parse_args(args)
+
+def fetch_data(url):
+ """Fetch JSON data from a URL using HTTP GET.
+
+ Args:
+ url (str): URL to request.
+
+ Returns:
+ dict|list|None: Parsed JSON response if the request and parsing succeed,
+ otherwise None on any RequestException.
+ """
+ try:
+ response = requests.get(url)
+ response.raise_for_status()
+ return response.json()
+ except requests.exceptions.RequestException as e:
+ print(f"Error fetching data: {e}")
+ return None
+
+def check_tracks_api_loading(track_payload, site, quiet=False):
+ print("Checking loading of track api ...")
+
+ track_cat_url = SITE_URL[site] + "/api/tracks/track_categories/"
+ track_url = SITE_URL[site] + "/api/tracks/track/"
+
+ error_msg = {}
+ with open(track_payload, "r") as f:
+ track_metadata = json.load(f)
+ for genome_uuid in track_metadata:
+ label = track_metadata[genome_uuid]["label"]
+ description = track_metadata[genome_uuid]["description"]
+ bigbed = os.path.basename(
+ track_metadata[genome_uuid]["datafiles"]["details"]
+ )
+ bigwig = os.path.basename(
+ track_metadata[genome_uuid]["datafiles"]["summary"]
+ )
+
+ error_msg[genome_uuid] = {
+ "msgs": [],
+ "tracks": {}
+ }
+ url = f"{track_cat_url}/{genome_uuid}"
+ data = fetch_data(url)
+ if data:
+ for category in data["track_categories"]:
+ if category["label"] != "Short variants by resource":
+ continue
+
+ if category["track_category_id"] != "short-variants":
+ msg = "\tCategory id -\n"
+ msg += f"\t GOT: {category['track_category_id']}\n"
+ msg += "\tEXPECTED: short-variants\n"
+ error_msg[genome_uuid]['msgs'].append(msg)
+
+ if category["type"] != "Variation":
+ msg = f"\tCategory type -\n"
+ msg += f"\t GOT: {category['type']}\n"
+ msg += "\tEXPECTED: Variation\n"
+ error_msg[genome_uuid]['msgs'].append(msg)
+
+ if len(category["track_list"]) > 1:
+ msg = f"\tNumber of tracks -\n"
+ msg += f"\t GOT: {len(category['track_list'])} tracks\n"
+ msg += f"\tEXPECTED: 1 track\n"
+ error_msg[genome_uuid]['msgs'].append(msg)
+
+ for track in category["track_list"]:
+ error_msg[genome_uuid]["tracks"][track['track_id']] = []
+ if track["type"] != "variant":
+ msg = f"\tTrack type -\n"
+ msg += f"\t GOT: {track['type']}\n"
+ msg += f"\tEXPECTED: variant\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+
+ if track["label"] != label:
+ msg = f"\tTrack label -\n"
+ msg += f"\t GOT: {track['label']}\n"
+ msg += f"\tEXPECTED: {label}\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+
+ if track["description"] != description:
+ msg = f"\tTrack description -\n"
+ msg += f"\t GOT: {track['description']}\n"
+ msg += f"\tEXPECTED: {description}\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+
+ url = f"{track_url}/{track['track_id']}"
+ data = fetch_data(url)
+ if data:
+ if data["label"] != label:
+ msg = f"\t(/track endpoint) Track label-\n"
+ msg += f"\t GOT: {data['label']}\n"
+ msg += f"\tEXPECTED: {label}\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+
+ if data["datafiles"]["variant-details"] != bigbed:
+ msg = "\tbigBed file -\n"
+ msg += f"\t GOT: {data['datafiles']['variant-details']}\n"
+ msg += f"\tEXPECTED: {bigbed}\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+
+ if data["datafiles"]["variant-summary"] != bigwig:
+ msg = "\tbigWig file -\n"
+ msg += f"\t GOT: {data['datafiles']['variant-summary']}\n"
+ msg += f"\tEXPECTED: {bigwig}\n"
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+ else:
+ msg = "No track information found."
+ error_msg[genome_uuid]["tracks"][track['track_id']].append(msg)
+ else:
+ msg = "No track category information found."
+ error_msg[genome_uuid]['msgs'].append(msg)
+
+ for genome_uuid in error_msg:
+ any_error = False
+ if error_msg[genome_uuid]["msgs"]:
+ any_error = True
+ for track in error_msg[genome_uuid]["tracks"]:
+ if error_msg[genome_uuid]["tracks"][track]:
+ any_error = True
+
+ if any_error:
+ print(f"genome: {genome_uuid}")
+ for msg in error_msg[genome_uuid]["msgs"]:
+ print(msg)
+ any_error = True
+ for track in error_msg[genome_uuid]["tracks"]:
+ print(f"track: {track}")
+ for msg in error_msg[genome_uuid]["tracks"][track]:
+ print(msg)
+ any_error = True
+ if not any_error and not quiet:
+ print(f"genome: {genome_uuid}")
+ print("All OK.\n")
+
+def check_track_files_copy(track_payload, site, quiet=False):
+ print("Checking copy of track files ...")
+
+ target_dir = os.environ.get('GB_DIR', None)
+ if target_dir is None:
+ raise EnvironmentError("Set GB_DIR to where genome browser files are located and try again...")
+
+ if site == "live" and "/staging/" in target_dir:
+ print(f"live site selected, updating gb directory...")
+ target_dir = target_dir.replace("staging", "live")
+ if site == "staging" and "/live/" in target_dir:
+ print(f"staging site selected, updating gb directory...")
+ target_dir = target_dir.replace("live", "staging")
+
+ error_msg = {}
+ with open(track_payload, "r") as f:
+ track_metadata = json.load(f)
+ for genome_uuid in track_metadata:
+ any_error = False
+ error_msg[genome_uuid] = []
+ datafiles = track_metadata[genome_uuid]['datafiles']
+ variant_files = glob(target_dir + f"/{genome_uuid}/variant*")
+
+ for type in datafiles:
+ file_name = os.path.basename(datafiles[type])
+ target_file_path = os.path.join(target_dir, genome_uuid, file_name)
+
+ if not os.path.isfile(target_file_path):
+ any_error = True
+ error_msg[genome_uuid].append(f"\tNOT FOUND: {file_name}")
+
+
+ if len(variant_files) > len(datafiles):
+ any_error = True
+
+ variant_filenames = [os.path.basename(file) for file in variant_files]
+ data_filenames = [os.path.basename(datafiles[type]) for type in datafiles]
+ excess_filenames = set(variant_filenames) - set(data_filenames)
+
+ error_msg[genome_uuid].append("\n\tExcess variant track files in gb directory -")
+ for file in excess_filenames:
+ error_msg[genome_uuid].append(f"\t{ file }")
+
+ if any_error:
+ print(f"genome: {genome_uuid}")
+ for msg in error_msg[genome_uuid]:
+ print(msg)
+ if not any_error and not quiet:
+ print(f"genome: {genome_uuid}")
+ print("All OK.")
+
+def check_api_files_copy(api_payload, site, quiet=False):
+ print("Checking copy of api files ...")
+
+ target_dir = os.environ.get('GB_DIR', None)
+ if target_dir is None:
+ raise EnvironmentError("Set GB_DIR to where genome browser files are located and try again...")
+
+ if site == "live" and "/staging/" in target_dir:
+ print(f"live site selected, updating gb directory...")
+ target_dir = target_dir.replace("staging", "live")
+ if site == "staging" and "/live/" in target_dir:
+ print(f"staging site selected, updating gb directory...")
+ target_dir = target_dir.replace("live", "staging")
+
+ error_msg = {}
+ with open(api_payload, "r") as f:
+ api_metadata = json.load(f)
+ for dataset in api_metadata:
+ any_error = False
+ genome_uuid = dataset['genome_uuid']
+ error_msg[genome_uuid] = []
+ variant_files = glob(target_dir + f"/{genome_uuid}/variation.vcf.gz*")
+
+ file_name = "variation.vcf.gz"
+ target_file_path = os.path.join(target_dir, genome_uuid, file_name)
+ if not os.path.isfile(target_file_path):
+ any_error = True
+ error_msg[genome_uuid].append(f"\tNOT FOUND: {file_name}")
+
+ if len(variant_files) > 2:
+ any_error = True
+ error_msg[genome_uuid].append(f"\n\tExcess files; file count {len(variant_files)}, expected 2.")
+
+ if any_error:
+ print(f"genome: {genome_uuid}")
+ for msg in error_msg[genome_uuid]:
+ print(msg)
+ if not any_error and not quiet:
+ print(f"genome: {genome_uuid}")
+ print("All OK.")
+
+def main(args=None):
+ """Validate a handover payload against the target site and filesystem.
+
+ Depending on --type, checks the track API endpoints and the copied
+ track files ("tracks"), or the copied api files ("api"), against the
+ expected values in the payload JSON.
+
+ Args:
+ args (list|None): Argument list for testing; if None, argparse
+ reads from sys.argv.
+
+ Raises:
+ FileNotFoundError: If the provided payload JSON file does not exist.
+ """
+
+ args = parser_args(args)
+ payload_json = args.payload_json
+ site = args.site
+ type = args.type
+ quiet = args.quiet
+
+ if not os.path.isfile(payload_json):
+ raise FileNotFoundError(
+ f"No such file - {payload_json}, please provide correct payload file"
+ )
+
+ if type == "tracks":
+ check_tracks_api_loading(payload_json, site, quiet)
+ check_track_files_copy(payload_json, site, quiet)
+ if type == "api":
+ check_api_files_copy(payload_json, site, quiet)
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/test_track_api_endpoint.py b/scripts/test_track_api_endpoint.py
deleted file mode 100644
index 7bcf52e4..00000000
--- a/scripts/test_track_api_endpoint.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env python3
-
-# See the NOTICE file distributed with this work for additional information
-# regarding copyright ownership.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import requests
-import sys
-import os
-import json
-
-
-def fetch_data(url):
- """Fetch JSON data from a URL using HTTP GET.
-
- Args:
- url (str): URL to request.
-
- Returns:
- dict|list|None: Parsed JSON response if the request and parsing succeed,
- otherwise None on any RequestException.
- """
- try:
- response = requests.get(url)
- response.raise_for_status()
- return response.json()
- except requests.exceptions.RequestException as e:
- print(f"Error fetching data: {e}")
- return None
-
-
-def main(argv):
- """Validate track API endpoints against a local track metadata file.
-
- Reads a track metadata JSON file, queries the remote /track_categories and /track
- API endpoints for each genome UUID and compares returned values (label, description,
- datafile names) with the expected values from the metadata file. Prints mismatches.
-
- Args:
- argv (list): Command-line argument vector; argv[1] should be the path to the
- track metadata JSON file.
-
- Raises:
- FileNotFoundError: If the provided track metadata file does not exist.
- """
- track_metadata_file = argv[1]
- if not os.path.isfile(track_metadata_file):
- raise FileNotFoundError(
- f"No such file - {track_metadata_file}, please provide correct track metadata file"
- )
-
- with open(track_metadata_file, "r") as f:
- track_metadata = json.load(f)
- for genome_uuid in track_metadata:
- label = track_metadata[genome_uuid]["label"]
- description = track_metadata[genome_uuid]["description"]
- bigbed = os.path.basename(
- track_metadata[genome_uuid]["datafiles"]["details"]
- )
- bigwig = os.path.basename(
- track_metadata[genome_uuid]["datafiles"]["summary"]
- )
-
- print(f"Genome UUID: {genome_uuid}")
- url = f"https://staging-2020.ensembl.org/api/tracks/track_categories/{genome_uuid}"
- data = fetch_data(url)
- if data:
- for category in data["track_categories"]:
- if category["label"] != "Short variants by resource":
- continue
-
- print("from /track_categories endpoint")
- if category["track_category_id"] != "short-variants":
- print(f"Category id: {category['track_category_id']}")
- print("\tEXPECTED: short-variants")
- if category["type"] != "Variation":
- print(f"Category type: {category['type']}")
- print("\tEXPECTED: Variation")
-
- if len(category["track_list"]) > 1:
- print(
- f"EXPECTED: single tracks, but {len(category['track_list'])} tracks found"
- )
-
- for track in category["track_list"]:
- print(f"Track id: {track['track_id']}")
- if track["type"] != "variant":
- print(f"Track type: {track['type']}")
- print("\tEXPECTED: variant")
-
- if track["label"] != label:
- print(f"Track label: {track['label']}")
- print(f"\tEXPECTED: {label}")
-
- if track["description"] != description:
- print(f"Track description: {track['description']}")
- print(f"\tEXPECTED: {description}")
-
- url = f"https://staging-2020.ensembl.org/api/tracks/track/{track['track_id']}"
- data = fetch_data(url)
-
- if data:
- print("from /track endpoint:")
- if data["label"] != label:
- print(f"label: {data['label']}")
- print(f"\tEXPECTED: {label}")
- if data["datafiles"]["variant-details"] != bigbed:
- print(f"biBed file: {data['datafiles']['variant-details']}")
- print(f"\tEXPECTED: {bigbed}")
- if data["datafiles"]["variant-summary"] != bigwig:
- print(f"biWig file: {data['datafiles']['variant-summary']}")
- print(f"\tEXPECTED: {bigwig}")
- else:
- print("No track categories retrieved.")
-
- else:
- print("No track information retrieved.")
- print()
-
-
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/scripts/update_variant_id.py b/scripts/update_variant_id.py
new file mode 100644
index 00000000..ca3e49bf
--- /dev/null
+++ b/scripts/update_variant_id.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from cyvcf2 import VCF, Writer
+import sys
+import os
+import argparse
+import json
+import subprocess
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def parse_args(args=None):
+ """Parse command-line arguments for creating track API metadata.
+
+ Args:
+ args (list|None): Argument list for testing. If None, argparse reads from sys.argv.
+
+ Returns:
+ argparse.Namespace: Parsed arguments, including tracks_outdir and input_config.
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--input_config",
+ dest="input_config",
+ type=str,
+ required=True,
+ help="input_config json file used in vcf_prepper",
+ )
+ parser.add_argument(
+ "--dir",
+ dest="pipeline_outdir",
+ type=str,
+ help="path to a vcf prepper output directory",
+ )
+ parser.add_argument(
+ "--keep_node_id",
+ dest="keep_node_id",
+ action="store_true",
+ help="keep node identifier in INFO/NODEID field",
+ )
+
+ return parser.parse_args(args)
+
+def main(args=None):
+ args = parse_args(args)
+
+ input_config = args.input_config
+ pipeline_outdir = args.pipeline_outdir or os.getcwd()
+
+ if input_config is None or not os.path.isfile(input_config):
+ logger.error(f"Please provide input config to proceed")
+ exit(1)
+
+ with open(input_config, "r") as file:
+ species_metadata = json.load(file)
+
+ for genome in species_metadata:
+ for input_config in species_metadata[genome]:
+ genome_uuid = input_config["genome_uuid"]
+ source = input_config["source_name"]
+ logger.info(f"Running for: {source}")
+
+ # update ID in VCF
+ logger.info("Processing VCF...")
+
+ input_file = os.path.join(pipeline_outdir, "api", genome_uuid, f"variation_{source}.vcf.gz")
+ output_file = input_file.replace(".vcf", "_renamed.vcf")
+
+ input_vcf = VCF(input_file)
+ if args.keep_node_id:
+ input_vcf.add_info_to_header({
+ 'ID': 'NODEID',
+ 'Description': 'Identifier of the nodes this variant belong to',
+ 'Type':'String',
+ 'Number': '1'
+ })
+ output_vcf = Writer(output_file, input_vcf, mode="wz")
+
+ csq_header_info = input_vcf.get_header_type("CSQ")["Description"]
+ spdi_idx = csq_header_info.split("Format: ")[-1].split("|").index("SPDI")
+
+ unique_ids = {}
+ for variant in input_vcf:
+ spdi = variant.INFO.get('CSQ').split(",")[0].split("|")[spdi_idx]
+
+ (chr, pos, deleted, inserted) = spdi.split(":")
+ deleted_length = int(deleted) if deleted.isdecimal() else len(deleted)
+ deleted_length = deleted_length if deleted_length != 0 else ""
+ inserted_length = int(inserted) if inserted.isdecimal() else len(inserted)
+ inserted_length = inserted_length if inserted_length != 0 else ""
+ new_spdi = f"{chr}:{pos}:{deleted_length}:{inserted_length}"
+
+ identifier = f"{variant.CHROM}:{variant.POS}:{variant.REF}:{','.join(variant.ALT)}"
+ if identifier in unique_ids:
+ logger.error(f"Identifier clash for {identifier}: {unique_ids[identifier]} vs {new_spdi}")
+ logger.error("Cannot proceed...")
+ exit(1)
+ unique_ids[identifier] = new_spdi
+
+ if args.keep_node_id:
+ variant.INFO['NODEID'] = variant.ID
+ variant.ID = new_spdi
+
+ output_vcf.write_record(variant)
+
+ input_vcf.close()
+ output_vcf.close()
+
+ # index the file
+ process = subprocess.run([
+ "tabix", "-C", "-p", "vcf", output_file
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ if process.returncode != 0:
+ logger.error("Failed to index output - ", process.stderr.decode().strip())
+ exit(1)
+
+ # update ID in bigBed
+ logger.info("Processing bigBed...")
+
+ input_bb = os.path.join(pipeline_outdir, "tracks", genome_uuid, f"variant-{source.lower()}-details.bb")
+ bed_file = input_bb.replace(".bb", ".bed")
+
+ # convert bb to bed file
+ process = subprocess.run([
+ "bigBedToBed", input_bb, bed_file
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ if process.returncode != 0:
+ logger.error("Failed to convert bb to bed - ", process.stderr.decode().strip())
+ exit(1)
+
+ # update ID fields in bed file
+ tmp_bed = bed_file + ".tmp"
+ field_num = 9
+ with open(bed_file, "r") as r_f, open(tmp_bed, "w") as w_f:
+ for line in r_f:
+ fields = line.split("\t")
+ if fields[4] == "insertion":
+ identifier = f"{fields[0]}:{int(fields[1])-1}:{fields[5]}:{fields[6]}"
+ else:
+ identifier = f"{fields[0]}:{int(fields[1])+1}:{fields[5]}:{fields[6]}"
+ if identifier not in unique_ids:
+ logger.error(f"Identifier lookup failed for - {identifier}")
+ exit(1)
+ fields[3] = unique_ids[identifier]
+ field_num = len(fields)
+
+ w_f.write("\t".join(fields))
+
+ # convert the updated bed to bb
+ tmp_bb = input_bb + ".tmp"
+ chrom_sizes = os.path.join(pipeline_outdir, "tmp", genome_uuid, f"{genome}.chrom.sizes")
+ process = subprocess.run([
+ "bedToBigBed", f"-type=bed3+{field_num-3}", tmp_bed, chrom_sizes, tmp_bb
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ if process.returncode != 0:
+ logger.error("Failed to convert bed to bb - ", process.stderr.decode().strip())
+ exit(1)
+
+ # replace current vcf and bb file
+ os.remove(input_bb)
+ os.rename(tmp_bb, input_bb)
+
+ os.rename(output_file, input_file)
+ os.rename(output_file + ".csi", input_file + ".csi")
+
+ # remove tmp file
+ os.remove(tmp_bed)
+ os.remove(bed_file)
+
+if __name__ == "__main__":
+ sys.exit(main())
\ No newline at end of file