Skip to content

Commit 5cfa67e

Browse files
authored
Merge pull request #76 from gmcvicker/master
Fixes and improvements to mapping pipeline
2 parents 556055e + 3f6a327 commit 5cfa67e

17 files changed

+1932
-339
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ htmlcov/
3939
.cache
4040
nosetests.xml
4141
coverage.xml
42+
mapping/test_data
4243

4344
# Translations
4445
*.mo

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
Version 0.3.1 - August 31, 2018
2+
-----------
3+
This release makes several improvements/fixes to read filtering by the mapping pipeline. Several of these improvements were suggested by Alex Dobin for using WASP with the STAR mapper.
4+
5+
Changes include:
6+
* find_intersecting_snps.py was modified to handle dependence of alleles when both reads from a read pair overlap the same variant(s). Previously the reads that were generated for remapping were treated independently; however, if read1 has the alternate allele for a given SNP, read2 should also have it if it overlaps the same SNP(s).
7+
* snp2h5 was altered to add a new phase carray to haplotype.h5 containing phase information from the VCF. find_intersecting_snps.py will now use the phase information from haplotype.h5 to calculate new haplotypes with all possible allelic combinations at unphased sites, resulting in more reads being generated for remapping. If phase information is not provided in haplotype.h5, all sites will be assumed unphased.
8+
* Supplementary and secondary alignments are now filtered by find_intersecting_snps and filter_remapped_reads
9+
* Reads whose CIGAR flags change after the remapping step are now discarded, even if they map to the same start position.
10+
* snp2h5 will now try to add/remove 'chr' from the chromosome name read from the VCF file if the original name does not match any chromosomes in the chromInfo file.
11+
12+
113
Version 0.3.0 - July 3, 2018
214
-----------
315
This release moves the codebase from Python2 to Python3 and includes several additional improvements.

CHT/README.bam2h5.md

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,6 @@ BAM files must be sorted and indexed.
6060
counts are randomly assigned to ONE of overlapping SNPs (regardless of
6161
their genotype).
6262

63-
* --samples SAMPLES_TXT_FILE [optional]
64-
65-
Path to text file containing a list of individual identifiers. The
66-
ordering of individuals must be consistent with the haplotype
67-
file. The samples file is assumed to have one identifier per line
68-
in the first column (other columns are ignored).
69-
7063
* --individual INDIVIDUAL [optional]
7164

7265
Identifier for individual, used to determine which
@@ -125,6 +118,6 @@ BAM files must be sorted and indexed.
125118
--alt_as_counts alt_as_counts.$INDIVIDUAL.h5 \
126119
--other_as_counts other_as_counts.$INDIVIDUAL.h5 \
127120
--read_counts read_counts.$INDIVIDUAL.h5 \
128-
--text_counts counts.$INDIVIDUAL.txt.gz \
121+
--txt_counts counts.$INDIVIDUAL.txt.gz \
129122
H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
130123

CHT/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@ before bam2h5.py can be run.
151151
--snp_index example_data/snp_index.h5 \
152152
--snp_tab example_data/snp_tab.h5 \
153153
--haplotype example_data/haps.h5 \
154-
--samples $ALL_SAMPLES_FILE \
155154
--individual $INDIVIDUAL \
156155
--ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
157156
--alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \

CHT/bam2h5.py

Lines changed: 47 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,11 @@
6666
counts are randomly assigned to ONE of overlapping SNPs (regardless of
6767
their genotype).
6868
69-
--samples SAMPLES_TXT_FILE [optional]
70-
Path to text file containing a list of individual identifiers. The
71-
ordering of individuals must be consistent with the haplotype
72-
file. The samples file is assumed to have one identifier per line
73-
in the first column (other columns are ignored).
74-
7569
--individual INDIVIDUAL [optional]
7670
Identifier for individual, used to determine which
7771
SNPs are heterozygous. Must be provided
7872
if --haplotype argument is provided and must match one of the
79-
individuals in the file provided with --samples argument.
73+
samples in the haplotype HDF5 file.
8074
8175
Output Options:
8276
--data_type uint8|uint16
@@ -122,6 +116,9 @@
122116
import chromstat
123117
import util
124118

119+
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))+"/../mapping/")
120+
import snptable
121+
125122
# codes used by pysam for aligned read CIGAR strings
126123
BAM_CMATCH = 0 # M
127124
BAM_CINS = 1 # I
@@ -148,6 +145,8 @@
148145
MAX_UINT8_COUNT = 255
149146
MAX_UINT16_COUNT = 65535
150147

148+
unimplemented_CIGAR = [0, set()]
149+
151150

152151

153152
def create_carray(h5f, chrom, data_type):
@@ -283,9 +282,11 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx):
283282
# in read and not used in alignment
284283
pass
285284
else:
286-
sys.stderr.write("skipping because contains CIGAR code %s "
287-
" which is not currently implemented\n" %
288-
BAM_CIGAR_DICT[op])
285+
unimplemented_CIGAR[0] += 1
286+
unimplemented_CIGAR[1].add(BAM_CIGAR_DICT[op])
287+
# sys.stderr.write("skipping because contains CIGAR code %s "
288+
# " which is not currently implemented\n" %
289+
# BAM_CIGAR_DICT[op])
289290

290291
# are any of the SNPs indels? If so, discard.
291292
for i in snp_idx:
@@ -466,22 +467,11 @@ def parse_args():
466467
metavar="HAPLOTYPE_H5_FILE",
467468
default=None)
468469

469-
parser.add_argument("--samples",
470-
help="Path to text file containing a list of "
471-
"individual identifiers. The ordering of individuals "
472-
"must be consistent with the haplotype file. The "
473-
"samples file is assumed to have one identifier per "
474-
"line in the first column (other columns are "
475-
"ignored).",
476-
metavar="SAMPLES_TXT_FILE",
477-
default=None)
478-
479470
parser.add_argument("--individual",
480471
help="Identifier for individual, used to determine "
481-
"which SNPs are heterozygous. Must be provided "
482-
"if --haplotype argument is provided and must "
483-
"match one of the individuals in the file provided "
484-
"with --samples argument.",
472+
"which SNPs are heterozygous. Must be provided if "
473+
"--haplotype argument is provided and must match one "
474+
"of the samples in the haplotype HDF5 file.",
485475
metavar="INDIVIDUAL",
486476
default=None)
487477

@@ -538,8 +528,8 @@ def parse_args():
538528

539529
args = parser.parse_args()
540530

541-
if args.haplotype and (args.individual is None or args.samples is None):
542-
parser.error("--indidivual and --samples arguments "
531+
if args.haplotype and (args.individual is None):
532+
parser.error("--indidivual argument "
543533
"must also be provided when --haplotype argument "
544534
"is provided")
545535

@@ -549,53 +539,6 @@ def parse_args():
549539

550540

551541

552-
def lookup_individual_index(samples_file, ind_name, population=None):
553-
"""Gets the index of individual that is used
554-
to lookup information in the genotype and haplotype tables"""
555-
f = open(samples_file, "rt")
556-
557-
if population:
558-
p = population.lower()
559-
else:
560-
p = None
561-
562-
idx = 0
563-
for line in f:
564-
if line.startswith("samples"):
565-
# header line
566-
continue
567-
568-
words = line.rstrip().split()
569-
name = words[0].replace("NA", "")
570-
571-
if len(words) > 1:
572-
pop = words[1].lower()
573-
else:
574-
pop = ""
575-
576-
if len(words) > 2:
577-
group = words[2].lower()
578-
else:
579-
group = ""
580-
581-
# if specified, only consider a single population or group
582-
if p and pop != p and group != p:
583-
continue
584-
585-
if name == ind_name:
586-
f.close()
587-
return idx
588-
589-
idx += 1
590-
591-
592-
raise ValueError("individual %s (with population=%s) "
593-
"is not in samples file %s" %
594-
(ind_name, population, samples_file))
595-
596-
597-
598-
599542
def main():
600543
args = parse_args()
601544

@@ -612,10 +555,8 @@ def main():
612555

613556
if args.haplotype:
614557
hap_h5 = tables.open_file(args.haplotype, "r")
615-
ind_idx = lookup_individual_index(args.samples, args.individual)
616558
else:
617559
hap_h5 = None
618-
ind_idx = None
619560

620561
ref_count_h5 = tables.open_file(args.ref_as_counts, "w")
621562
alt_count_h5 = tables.open_file(args.alt_as_counts, "w")
@@ -645,10 +586,12 @@ def main():
645586
else:
646587
raise NotImplementedError("unsupported datatype %s" % args.data_type)
647588

648-
# create a list to hold the counts that will be later written
649-
# to a txt file
650-
if args.text_counts is not None:
651-
txt_counts = list()
589+
# create a txt file that also holds the counts
590+
if args.txt_counts is not None:
591+
if os.path.splitext(args.txt_counts)[1] == ".gz":
592+
txt_counts = gzip.open(args.txt_counts, 'a+')
593+
else:
594+
txt_counts = open(args.txt_counts, 'a+')
652595

653596
for chrom in chrom_list:
654597
sys.stderr.write("%s\n" % chrom.name)
@@ -666,8 +609,18 @@ def main():
666609
snp_index_array = snp_index_h5.get_node("/%s" % chrom.name)[:]
667610
if hap_h5:
668611
hap_tab = hap_h5.get_node("/%s" % chrom.name)
612+
ind_idx = snptable.SNPTable().get_h5_sample_indices(
613+
hap_h5, chrom, [args.individual]
614+
)[1]
615+
if len(ind_idx) != 0:
616+
ind_idx = ind_idx[0]
617+
else:
618+
hap_tab = None
619+
ind_idx = None
669620
else:
670621
hap_tab = None
622+
ind_idx = None
623+
671624

672625
# initialize count arrays for this chromosome to 0
673626
ref_carray = get_carray(ref_count_h5, chrom)
@@ -709,7 +662,7 @@ def main():
709662
# file later
710663
# columns are:
711664
# chrom, pos, ref, alt, genotype, ref_count, alt_count, other_count
712-
if args.text_counts is not None:
665+
if args.txt_counts is not None:
713666
chrom = np.tile(chrom.name, len(snp_tab))
714667
pos = np.array([snp['pos'] for snp in snp_tab])
715668
ref = np.array([snp['allele1'] for snp in snp_tab])
@@ -719,22 +672,26 @@ def main():
719672
for hap in hap_tab])
720673
else:
721674
genotype = np.empty((len(snp_tab), 0))
722-
txt_counts.append(
675+
# write an np array to a txt file
676+
np.savetxt(
677+
txt_counts,
723678
np.column_stack((chrom, pos, ref, alt, genotype,
724-
ref_array[pos-1],
725-
alt_array[pos-1],
726-
other_array[pos-1]))
679+
ref_array[pos-1],
680+
alt_array[pos-1],
681+
other_array[pos-1])),
682+
fmt="%1s",
683+
delimiter=" "
727684
)
728685

729686

730687
samfile.close()
731688

732-
# write the txt_counts np arrays to a txt file
733-
if args.text_counts is not None:
734-
# we use vstack to combine np arrays row-wise into a multi-dimensional
735-
# array
736-
np.savetxt(args.text_counts, np.vstack(tuple(txt_counts)),
737-
fmt="%1s", delimiter=" ")
689+
if args.txt_counts:
690+
# close the open txt file handler
691+
txt_counts.close()
692+
693+
# check if any of the reads contained an unimplemented CIGAR
694+
sys.stderr.write("WARNING: Encountered "+str(unimplemented_CIGAR[0])+" instances of any of the following CIGAR codes: "+str(unimplemented_CIGAR[1])+". The regions of reads with these CIGAR codes were skipped because these CIGAR codes are currently unimplemented.\n")
738695

739696
# set track statistics and close HDF5 files
740697

examples/example_cht_workflow.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ do
5151
--snp_index $DATA_DIR/snp_index.h5 \
5252
--snp_tab $DATA_DIR/snp_tab.h5 \
5353
--haplotype $DATA_DIR/haps.h5 \
54-
--samples $ALL_SAMPLES_FILE \
5554
--individual $INDIVIDUAL \
5655
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
5756
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \

mapping/Snakefile

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -216,20 +216,29 @@ rule rmdup_pe:
216216
rule get_as_counts:
217217
"""get allele-specific read counts for SNPs"""
218218
input:
219-
bam=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam",
220-
snp_index=config["snp_h5_dir"] + "/snp_index.h5",
221-
snp_tab=config["snp_h5_dir"] + "/snp_tab.h5",
222-
haplotype=config['snp_h5_dir'] + "/haplotype.h5",
219+
bam=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam",
220+
snp_index=config["snp_h5_dir"] + "/snp_index.h5",
221+
snp_tab=config["snp_h5_dir"] + "/snp_tab.h5",
222+
haplotype=config['snp_h5_dir'] + "/haplotype.h5",
223+
chrom=config['chrom_info']
223224
params:
224-
samp1kg=lambda wildcards: SAMP_TO_1KG[wildcards.sample]
225+
samp1kg=lambda wildcards: SAMP_TO_1KG[wildcards.sample]
225226
output:
226-
config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz"
227+
ref_as=config['output_dir'] + "/as_counts/{sample}.ref_as_counts.h5",
228+
alt_as=config['output_dir'] + "/as_counts/{sample}.alt_as_counts.h5",
229+
other_as=config['output_dir'] + "/as_counts/{sample}.other_as_counts.h5",
230+
read_counts=config['output_dir'] + "/as_counts/{sample}.read_counts.h5",
231+
txt_counts=config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz"
227232
shell:
228-
"python {config[wasp_dir]}/mapping/get_as_counts.py "
229-
" --snp_tab {input.snp_tab} "
233+
"python {config[wasp_dir]}/CHT/bam2h5.py "
234+
" --chrom {input.chrom} "
230235
" --snp_index {input.snp_index} "
236+
" --snp_tab {input.snp_tab} "
231237
" --haplotype {input.haplotype} "
232-
" --samples {config[sample_file]} "
233-
" --genotype_sample {params.samp1kg} "
234-
" {input.bam} | gzip > {output}"
235-
238+
" --individual {params.samp1kg} "
239+
" --ref_as_counts {output.ref_as} "
240+
" --alt_as_counts {output.alt_as} "
241+
" --other_as_counts {output.other_as} "
242+
" --read_counts {output.read_counts} "
243+
" --txt_counts {output.txt_counts} "
244+
"{input.bam}"

0 commit comments

Comments
 (0)