Skip to content

Commit c15697c

Browse files
committed
add CLEAN tag
1 parent e195c43 commit c15697c

File tree

4 files changed

+5
-2
lines changed

4 files changed

+5
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
* Fix variant calling close to start/end boundaries of chromosomes
1616
* Add `--out-sv-rnames` and `--out-som-sv-rnames` to output SV-supporting read names (tag: `SVREADS`) in FORMAT field of VCF
17-
<!-- * Add --STR -->
17+
* Add `CLEAN` INFO tag for variants in clean regions, i.e., SNPs or simple small indels (≤5 bp) in non-repetitive regions, which are generally more reliable, to help with downstream filtering and benchmarking
1818

1919

2020
## Getting Started

src/call_var_main.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ typedef struct {
109109
uint8_t type; // BAM_CINS/BAM_CDEL/BAM_CDIFF
110110
hts_pos_t pos, PS; // phase set
111111
uint8_t *ref_bases; int ref_len;
112-
uint8_t is_somatic, is_sv;
112+
uint8_t is_somatic, is_sv, is_clean; // clean: SNP or simple indel in non-repetitive region
113113
uint8_t *tsd_seq; int tsd_len, polya_len; hts_pos_t tsd_pos1, tsd_pos2; // target site duplication, 2 TSDs for DEL
114114
int te_seq_i, te_is_rev; // char *rep_name, *rep_family, *rep_class; use te_seq_i to retrieve TE sequence info
115115

src/collect_var.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,6 +1412,7 @@ int make_variants(const call_var_opt_t *opt, bam_chunk_t *chunk, var_t **_var) {
14121412
}
14131413
var->vars[i].n_alt_allele = 0;
14141414
var->vars[i].is_sv = 0;
1415+
var->vars[i].is_clean = (chunk->var_i_to_cate[cand_i] & LONGCALLD_CAND_GERMLINE_CLEAN_VAR_CATE) != 0;
14151416
for (int hap=1; hap <= 2; ++hap) {
14161417
int hap_alle = hap == 1 ? hap1_alle : hap2_alle;
14171418
if (hap_alle != 0) { // alt allele

src/vcf_utils.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ void write_vcf_header(bam_hdr_t *hdr, struct call_var_opt_t *opt) {
5858
// INFO fields
5959
bcf_hdr_append(vcf_hdr, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
6060
bcf_hdr_append(vcf_hdr, "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Somatic/mosaic variant\">");
61+
bcf_hdr_append(vcf_hdr, "##INFO=<ID=CLEAN,Number=0,Type=Flag,Description=\"Clean-region variant (SNP or simple indel in non-repetitive region)\">");
6162
bcf_hdr_append(vcf_hdr, "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
6263
bcf_hdr_append(vcf_hdr, "##INFO=<ID=SVLEN,Number=A,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">");
6364
// TSD info
@@ -177,6 +178,7 @@ int write_var_to_vcf(var_t *vars, const struct call_var_opt_t *opt, bam_chunk_t
177178

178179
// Write QUAL, FILTER, INFO
179180
len += snprintf(buffer + len, buf_m - len, "\t%d\tPASS\t", var.QUAL);
181+
if (var.is_clean) len += snprintf(buffer + len, buf_m - len, "CLEAN;");
180182
if (var.is_somatic) len += snprintf(buffer + len, buf_m - len, "SOMATIC;");
181183
if (var.te_seq_i >= 0) len += snprintf(buffer + len, buf_m - len, "MEI;");
182184
len += snprintf(buffer + len, buf_m - len, "END=%" PRId64 "", var.pos + var.ref_len - 1);

0 commit comments

Comments
 (0)