Skip to content

Commit bcf6c62

Browse files
committed
Release 1.21
2 parents 02ee548 + b411171 commit bcf6c62

File tree

142 files changed

+4543
-906
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

142 files changed

+4543
-906
lines changed

.cirrus.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ gcc_task:
6060
LC_ALL: C
6161
CIRRUS_CLONE_DEPTH: 1
6262
HTSDIR: ./htslib
63+
CFLAGS: -fsanitize=address
64+
LDFLAGS: -fsanitize=address
6365

6466
matrix:
6567
- environment:
@@ -82,6 +84,7 @@ ubuntu_task:
8284

8385
environment:
8486
CC: clang
87+
CFLAGS: -g -O2 -Werror -Wall -Wformat -Wformat=2
8588
LC_ALL: C
8689
CIRRUS_CLONE_DEPTH: 1
8790
HTSDIR: ./htslib
@@ -93,8 +96,8 @@ ubuntu_task:
9396
memory: 2G
9497
environment:
9598
USE_CONFIG: yes
96-
CFLAGS: -g -Wall -O3 -fsanitize=address
97-
LDFLAGS: -fsanitize=address -Wl,-rpath,`pwd`/inst/lib
99+
CFLAGS: -g -Wall -O3
100+
LDFLAGS: -Wl,-rpath,`pwd`/inst/lib
98101

99102
# NB: we could consider building a docker image with these
100103
# preinstalled and specifying that instead, to speed up testing.

HMM.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ double *hmm_get_fwd_bwd_prob(hmm_t *hmm);
127127
* @sites: list of positions
128128
*
129129
* Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
130-
* transition probabilities is returned. In this verison, emission
130+
* transition probabilities is returned. In this version, emission
131131
* probabilities are not updated.
132132
*/
133133
double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);

Makefile

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ OBJS = main.o vcfindex.o tabix.o \
4141
vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \
4242
regidx.o smpl_ilist.o csq.o vcfbuf.o \
4343
mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o bam2bcf_edlib.o \
44-
read_consensus.o bam_sample.o \
44+
read_consensus.o bam_sample.o \
4545
vcfsort.o cols.o extsort.o dist.o abuf.o \
4646
ccall.o em.o prob1.o kmin.o str_finder.o gff.o edlib.o
4747
PLUGIN_OBJS = vcfplugin.o
@@ -105,7 +105,7 @@ endif
105105

106106
include config.mk
107107

108-
PACKAGE_VERSION = 1.20
108+
PACKAGE_VERSION = 1.21
109109

110110
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
111111
# description of the working tree: either a release tag with the same value
@@ -235,7 +235,6 @@ vcfbuf_h = vcfbuf.h $(htslib_vcf_h)
235235
abuf_h = abuf.h $(htslib_vcf_h)
236236
dbuf_h = dbuf.h $(htslib_vcf_h)
237237
bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
238-
edlib.h = edlib.h
239238
bam_sample_h = bam_sample.h $(htslib_sam_h)
240239
cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h)
241240
read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h)
@@ -249,17 +248,17 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade
249248
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) $(bcftools_h)
250249
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
251250
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h
252-
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h filter.h
251+
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(htslib_bgzf_h) $(bcftools_h) extsort.h $(filter_h)
253252
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
254-
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
255-
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h) $(htslib_kbitset_h)
256-
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h
253+
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h) $(filter_h)
254+
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(htslib_kbitset_h) $(htslib_hts_endian_h) $(bcftools_h) regidx.h vcmp.h $(htslib_khash_h)
255+
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h regidx.h $(filter_h)
257256
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
258257
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
259-
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
258+
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(htslib_hts_defs_h) $(bcftools_h) HMM.h rbuf.h
260259
vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h)
261-
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h)
262-
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h)
260+
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(bcftools_h)
261+
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) $(htslib_bgzf_h) kheap.h $(bcftools_h)
263262
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h
264263
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
265264
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) $(khash_str2str_h)
@@ -276,7 +275,7 @@ mcall.o: mcall.c $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(call_h) $(prob1_h
276275
prob1.o: prob1.c $(prob1_h)
277276
vcmp.o: vcmp.c $(htslib_hts_h) $(htslib_vcf_h) vcmp.h
278277
ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h)
279-
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h
278+
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_defs_h) $(bcftools_h) peakfit.h
280279
peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h)
281280
bin.o: bin.c $(bcftools_h) bin.h
282281
dist.o: dist.c dist.h
@@ -287,14 +286,15 @@ mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(hts
287286
bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h
288287
bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h)
289288
bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h)
290-
bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) $(edlib.h)
289+
bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(str_finder_h) $(htslib_ksort_h) edlib.h
291290
read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h
292291
bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h)
293292
version.o: version.h version.c
294293
hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h
295294
HMM.o: HMM.c $(htslib_hts_h) HMM.h
296-
vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(vcfbuf_h) rbuf.h
295+
vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(htslib_kbitset_h) $(bcftools_h) $(vcfbuf_h) rbuf.h
297296
abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h
297+
edlib.o: edlib.c edlib.h
298298
extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h
299299
smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
300300
gff.o: gff.c $(htslib_hts_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) gff.h regidx.h
@@ -326,7 +326,7 @@ test/test-rbuf.o: test/test-rbuf.c rbuf.h
326326
test/test-rbuf: test/test-rbuf.o
327327
$(CC) $(LDFLAGS) -o $@ $^ $(ALL_LIBS)
328328

329-
test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) regidx.h
329+
test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) $(htslib_hts_os_h) $(htslib_hts_defs_h) regidx.h
330330

331331
test/test-regidx: test/test-regidx.o regidx.o | $(HTSLIB)
332332
$(CC) $(ALL_LDFLAGS) -o $@ $^ $(HTSLIB_LIB) -lpthread $(ALL_LIBS)

NEWS

Lines changed: 124 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,124 @@
1+
## Release 1.21 (12th September 2024)
2+
3+
4+
5+
Changes affecting the whole of bcftools, or multiple commands:
6+
7+
* Support multiple semicolon-separated strings when filtering by ID using -i/-e (#2190).
8+
For example, `-i 'ID="rs123"'` now correctly matches `rs123;rs456`
9+
10+
* The filtering expression ILEN can be positive (insertion), negative (deletion), zero
11+
(balanced substitutions), or set to missing value (symbolic alleles).
12+
13+
* bcftools query
14+
* bcftools +split-vep
15+
16+
- The column indices printed by default with `-H` (e.g., "#[1]CHROM") can now be
17+
suppressed by giving the option twice `-HH` (#2152)
18+
19+
20+
Changes affecting specific commands:
21+
22+
* bcftools annotate
23+
24+
- Support dynamic variables read from a tab-delimited annotation file (#2151)
25+
For example, in the two cases below the field 'STR' from the -a file is required to match
26+
the INFO/TAG in VCF. In the first example the alleles REF,ALT must match, in the second
27+
example they are ignored. The option -k is required to output also records that were not
28+
annotated:
29+
30+
bcftools annotate -a ann.tsv.gz -c CHROM,POS,REF,ALT,SCORE,~STR -i'TAG={STR}' -k in.vcf
31+
bcftools annotate -a ann.tsv.gz -c CHROM,POS,-,-,SCORE,~STR -i'TAG={STR}' -k in.vcf
32+
33+
- When adding Type=String annotations from a tab-delimited file, encode characters with
34+
special meaning using percent encoding (';', '=' in INFO and ':' in FORMAT) (#2202)
35+
36+
* bcftools consensus
37+
38+
- Allow applying a reference allele which overlaps a previous deletion; there is no
39+
need to complain about overlapping alleles in such case
40+
41+
- Fix a bug which required `-s -` to be present even when there were no samples in the VCF
42+
(#2260)
43+
44+
* bcftools csq
45+
46+
- Fix a rare bug where indel combined with a substitution ending at exon boundary is
47+
incorrectly predicted to have 'inframe' rather than 'frameshift' consequence (#2212)
48+
49+
* bcftools gtcheck
50+
51+
- Fix a segfault with --no-HWE-prob. The bug was introduced with the output format change in
52+
1.19 which replaced the DC section with DCv2 (#2180)
53+
54+
- The number of matching genotypes in the DCv2 output was not calculated correctly with
55+
non-zero `-E, --error-probability`. Consequently, also the average HWE score was incorrect.
56+
The main output, the discordance score, was not affected by the bug
57+
58+
* bcftools +mendelian2
59+
60+
- Include the number of good cases where at least one of the trio genotypes has an alternate
61+
allele (#2204)
62+
63+
- Fix the error message which would report the wrong sample when a non-existent sample is given.
64+
Note that the bug only affected the error message; the program otherwise assigns the family
65+
members correctly (#2242)
66+
67+
* bcftools merge
68+
69+
- Fix a severe bug in merging of FORMAT fields with Number=R and Number=A values. For example,
70+
rows with high-coverage FORMAT/AD values (greater than or equal to 128) could have been assigned
71+
to incorrect samples. The bug was introduced in version 1.19. For details see #2244.
72+
73+
* bcftools mpileup
74+
75+
- Return non-zero error code when the input BAM/CRAM file is truncated (#2177)
76+
77+
- Add FORMAT/AD annotation by default, disable with `-a -AD`
78+
79+
* bcftools norm
80+
81+
- Support realignment of symbolic <DUP.*> alleles, similarly to <DEL.*> added previously
82+
(#1919,#2145)
83+
84+
- Fix in reporting reference allele genotypes with `--multi-overlaps .` (#2160)
85+
86+
- Support of duplicate removal of symbolic alleles of the same type but different SVLEN (#2182)
87+
88+
- New `-S, --sort` switch to optionally sort output records by allele (#1484)
89+
90+
- Add the `-i/-e` filtering options to select records for normalization. Note duplicate
91+
removal ignores this option.
92+
93+
- Fix a bug where `--atomize` would not fill GT alleles for atomized SNVs followed by
94+
an indel (#2239)
95+
96+
* bcftools +remove-overlaps
97+
98+
- Revamp the program to allow greater flexibility, with the following new options:
99+
100+
-M, --mark-tag TAG Mark -m sites with INFO/TAG
101+
-m, --mark EXPR Mark (if also -M is present) or remove sites [overlap]
102+
dup .. all overlapping sites
103+
overlap .. overlapping sites
104+
min(QUAL) .. mark sites with lowest QUAL until overlaps are resolved
105+
--missing EXPR Value to use for missing tags with -m 'min(QUAL)'
106+
0 .. the default
107+
DP .. heuristics, scale maximum QUAL value proportionally to INFO/DP
108+
--reverse Apply the reverse logic, for example preserve duplicates instead of removing
109+
-O, --output-type t t: plain list of sites (chr,pos), tz: compressed list
110+
111+
* bcftools +tag2tag
112+
113+
- The conversions --LXX-to-XX, --XX-to-LXX were working but specific cases such as --LAD-to-AD were not.
114+
115+
- Print a more informative error message when the source tag type violates the VCF specification
116+
117+
* bcftools +trio-dnm2
118+
119+
- Better handling of the --strictly-novel functionality, especially with respect to chrX inheritance
120+
121+
1122
## Release 1.20 (15th April 2024)
2123

3124

@@ -716,7 +837,7 @@ Changes affecting specific commands:
716837
annotating from a tab-delimited text file, this feature can be invoked
717838
by using `-c INFO/END`.
718839

719-
- add a new '.' modifier to control wheter missing values should be carried
840+
- add a new '.' modifier to control whether missing values should be carried
720841
over from a tab-delimited file or not. For example:
721842

722843
-c TAG .. adds TAG if the source value is not missing. If TAG
@@ -1068,7 +1189,7 @@ Changes affecting specific commands:
10681189

10691190
* bcftools csq:
10701191

1071-
- Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too
1192+
- Fix a bug which caused incorrect FORMAT/BCSQ formatting at sites with too
10721193
many per-sample consequences
10731194

10741195
- Fix a bug which incorrectly handled the --ncsq parameter and could clash
@@ -1785,7 +1906,7 @@ Updates, improvements and bugfixes for many other commands:
17851906

17861907
* `roh`: Now possible to process multiple samples at once. This allows
17871908
considerable speedups for files with thousands of samples where the cost of
1788-
HMM is neglibible compared to I/O and decompressing. In order to fit tens of
1909+
HMM is negligible compared to I/O and decompressing. In order to fit tens of
17891910
thousands samples in memory, a sliding HMM can be used (new `--buffer-size`
17901911
option). Viterbi training now uses Baum-Welch algorithm, and works much
17911912
better. Support for gVCFs or FORMAT/PL tags. Added `-o, output` and

abuf.c

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ typedef struct
4343
kstring_t ref, alt;
4444
int ial; // the index of the original ALT allele, 1-based
4545
int beg, end; // 0-based inclusive offsets to ref,alt
46+
int plen; // the ref,alt prefix length, eg plen=1 for C>CA
4647
}
4748
atom_t;
4849

@@ -175,8 +176,9 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
175176
atom->alt.l = 0;
176177
kputc(refb, &atom->ref);
177178
kputc(refb, &atom->alt);
178-
atom->beg = atom->end = i;
179-
atom->ial = ial;
179+
atom->beg = atom->end = i;
180+
atom->ial = ial;
181+
atom->plen = 1;
180182
}
181183
continue;
182184
}
@@ -202,6 +204,35 @@ static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
202204
if ( rcmp ) return rcmp;
203205
return strcasecmp(a->alt.s,b->alt.s);
204206
}
207+
208+
// returns
209+
// 0 .. identical beg,ref,alt
210+
// 1 .. non-overlapping variants, but record may overlap (A>AT vs A>C)
211+
// 2 .. overlapping (conflicting) variants
212+
static int _atoms_overlap(const atom_t *a, const atom_t *b)
213+
{
214+
if ( a->beg < b->beg ) return 2;
215+
if ( a->beg > b->beg ) return 2;
216+
217+
// consider SNV followed by DEL as not overlapping
218+
// CC > C a.plen=1 (ref,alt prefix len=1)
219+
// C > T b.plen=0 (ref,alt prefix len=0)
220+
if ( a->plen && a->plen >= b->ref.l ) return 1;
221+
if ( b->plen && b->plen >= a->ref.l ) return 1;
222+
223+
int rcmp = strcasecmp(a->ref.s,b->ref.s);
224+
if ( rcmp ) return 2;
225+
226+
// consider SNV followed by INS as not overlapping
227+
// A > AT a.plen=1 (ref,alt prefix len=1)
228+
// A > C b.plen=0 (ref,alt prefix len=0)
229+
if ( a->plen && a->plen >= b->alt.l ) return 1;
230+
if ( b->plen && b->plen >= a->alt.l ) return 1;
231+
232+
rcmp = strcasecmp(a->alt.s,b->alt.s);
233+
if ( rcmp ) return 2;
234+
return 0;
235+
}
205236
/*
206237
For reproducibility of tests on different platforms, we need to guarantee the same order of identical
207238
atoms originating from different source ALTs. Even though they are consistent, different values can be
@@ -238,7 +269,14 @@ static void _split_table_new(abuf_t *buf, atom_t *atom)
238269
static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
239270
{
240271
uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
241-
ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
272+
int olap = _atoms_overlap(atom,buf->split.atoms[iout]);
273+
ptr[atom->ial-1] = olap > 1 ? 2 : 1;
274+
275+
// The test test/atomize.split.5.vcf shows why we sometimes can and sometimes
276+
// cannot remove the star allele like this
277+
// buf->split.overlaps[iout] = olap > 1 ? 1 : 0;
278+
// I forgot the details of the code, so don't immediately see
279+
// if this could be made smarter
242280
buf->split.overlaps[iout] = 1;
243281
}
244282
#if 0
@@ -745,7 +783,7 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec)
745783
_split_table_init(buf,rec,buf->natoms);
746784
for (i=0; i<buf->natoms; i++)
747785
{
748-
if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
786+
if ( i && _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i])==0 ) continue;
749787
_split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
750788
}
751789
for (i=0; i<buf->natoms; i++)

0 commit comments

Comments
 (0)