Skip to content

Commit 6921fd6

Browse files
authored
Merge pull request #228 from ENCODE-DCC/dev
v1.8.1
2 parents b4ffdfb + 241a1b9 commit 6921fd6

17 files changed

+637
-415
lines changed

.circleci/config.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,4 +285,3 @@ workflows:
285285
- test_workflow_true_rep_only_pbam_pe:
286286
requires:
287287
- build
288-

chip.wdl

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
version 1.0
22

33
workflow chip {
4-
String pipeline_ver = 'v1.8.0'
4+
String pipeline_ver = 'v1.8.1'
55

66
meta {
7-
version: 'v1.8.0'
7+
version: 'v1.8.1'
88
author: 'Jin wook Lee (leepc12@gmail.com) at ENCODE-DCC'
99
description: 'ENCODE TF/Histone ChIP-Seq pipeline'
1010
specification_document: 'https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit?usp=sharing'
1111

12-
caper_docker: 'encodedcc/chip-seq-pipeline:v1.8.0'
13-
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.8.0'
12+
caper_docker: 'encodedcc/chip-seq-pipeline:v1.8.1'
13+
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.8.1'
1414
croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.v5.json'
1515

1616
parameter_group: {
@@ -2463,20 +2463,22 @@ task count_signal_track {
24632463
File? ta # tag-align
24642464
File chrsz # 2-col chromosome sizes file
24652465
}
2466+
Float mem_gb = 8.0
24662467

24672468
command {
24682469
set -e
24692470
python3 $(which encode_task_count_signal_track.py) \
24702471
${ta} \
2471-
${'--chrsz ' + chrsz}
2472+
${'--chrsz ' + chrsz} \
2473+
${'--mem-gb ' + mem_gb}
24722474
}
24732475
output {
24742476
File pos_bw = glob('*.positive.bigwig')[0]
24752477
File neg_bw = glob('*.negative.bigwig')[0]
24762478
}
24772479
runtime {
24782480
cpu : 1
2479-
memory : '8 GB'
2481+
memory : '${mem_gb} GB'
24802482
time : 4
24812483
disks : 'local-disk 50 SSD'
24822484
}
@@ -2547,7 +2549,8 @@ task call_peak {
25472549
${'--chrsz ' + chrsz} \
25482550
${'--fraglen ' + fraglen} \
25492551
${'--cap-num-peak ' + cap_num_peak} \
2550-
${'--pval-thresh '+ pval_thresh}
2552+
${'--pval-thresh '+ pval_thresh} \
2553+
${'--mem-gb ' + mem_gb}
25512554
25522555
elif [ '${peak_caller}' == 'spp' ]; then
25532556
python3 $(which encode_task_spp.py) \
@@ -2614,7 +2617,8 @@ task macs2_signal_track {
26142617
${'--gensz '+ gensz} \
26152618
${'--chrsz ' + chrsz} \
26162619
${'--fraglen ' + fraglen} \
2617-
${'--pval-thresh '+ pval_thresh}
2620+
${'--pval-thresh '+ pval_thresh} \
2621+
${'--mem-gb ' + mem_gb}
26182622
}
26192623
output {
26202624
File pval_bw = glob('*.pval.signal.bigwig')[0]

dev/build_on_dx_dockerhub.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ set -e
33

44
VER=$(cat chip.wdl | grep "String pipeline_ver = " | awk '{gsub("'"'"'",""); print $4}')
55
DOCKER=encodedcc/chip-seq-pipeline:$VER
6-
DXWDL=~/dxWDL-v1.46.4.jar
6+
DXWDL=~/dxWDL-v1.50.jar
77

88
# check if docker image exists on dockerhub
99
docker pull $DOCKER

src/encode_lib_common.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,34 @@ def copy_f_to_dir(f, out_dir): # copy 'f' to 'out_dir'/'f'
280280
return copy_f_to_f(f, dest)
281281

282282

283+
def get_gnu_sort_param(max_mem_job, ratio=0.5):
284+
"""Get a string of parameters for GNU sort according to maximum memory of a job/instance.
285+
286+
For GNU `sort`, `-S` or `--buffer-size` defines the buffer size for the sorting,
287+
which defaults to max(available_mem, 1/8 * total_mem) of a node/instance.
288+
289+
sort -S [SIZE][UNIT] ...
290+
291+
See the following link for details.
292+
https://github.com/coreutils/coreutils/blob/master/src/sort.c#L1492
293+
294+
This can be a problem if a job is assigned with a limited amount of memory,
295+
but the job runs on a large node (e.g. 256GB of memory).
296+
297+
`-S` defines an INITIAL buffer size and it will automatically grow
298+
if more memory is needed by `sort`.
299+
300+
301+
Args:
302+
max_mem_job:
303+
Maximum amount of memory for a job/instance in bytes.
304+
ratio:
305+
Ratio to define the buffer size according to `max_mem_job`.
306+
"""
307+
mem_mb = int(math.ceil(max_mem_job * ratio / (1024 * 1024)))
308+
return '-S {mem_mb}M'.format(mem_mb=mem_mb)
309+
310+
283311
def now():
284312
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
285313

src/encode_lib_genomic.py

Lines changed: 73 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import subprocess
1111

1212
from encode_lib_common import (
13+
get_gnu_sort_param,
1314
get_num_lines,
1415
get_peak_type,
1516
human_readable_number,
@@ -324,7 +325,7 @@ def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
324325
# convert encode peak file to hammock (for Wash U browser track)
325326

326327

327-
def peak_to_hammock(peak, out_dir):
328+
def peak_to_hammock(peak, mem_gb, out_dir):
328329
peak_type = get_peak_type(peak)
329330
prefix = os.path.join(out_dir, os.path.basename(
330331
strip_ext_peak(peak)))
@@ -335,14 +336,26 @@ def peak_to_hammock(peak, out_dir):
335336
hammock_gz_tbi = '{}.gz.tbi'.format(hammock)
336337

337338
if get_num_lines(peak) == 0:
338-
cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
339-
run_shell_cmd(cmd)
340-
cmd2 = 'touch {}'.format(hammock_gz_tbi)
339+
run_shell_cmd(
340+
'zcat -f {peak} | gzip -nc > {hammock_gz}'.format(
341+
peak=peak,
342+
hammock_gz=hammock_gz,
343+
)
344+
)
345+
run_shell_cmd(
346+
'touch {hammock_gz_tbi}'.format(
347+
hammock_gz_tbi=hammock_gz_tbi,
348+
)
349+
)
341350
else:
342-
cmd = "zcat -f {} | "
343-
cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
344-
cmd = cmd.format(peak, hammock_tmp)
345-
run_shell_cmd(cmd)
351+
run_shell_cmd(
352+
'zcat -f {peak} | '
353+
'LC_COLLATE=C sort -k1,1V -k2,2n {sort_param} > {hammock_tmp}'.format(
354+
peak=peak,
355+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
356+
hammock_tmp=hammock_tmp,
357+
)
358+
)
346359

347360
with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
348361
id = 1
@@ -390,13 +403,22 @@ def peak_to_hammock(peak, out_dir):
390403

391404
fout.write('\n')
392405

393-
cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
394-
cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
395-
run_shell_cmd(cmd2)
396-
cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
397-
run_shell_cmd(cmd3)
406+
run_shell_cmd(
407+
'zcat -f {hammock_tmp2} | sort -k1,1 -k2,2n {sort_param} | '
408+
'bgzip -cf > {hammock_gz}'.format(
409+
hammock_tmp2=hammock_tmp2,
410+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
411+
hammock_gz=hammock_gz,
412+
)
413+
)
414+
run_shell_cmd(
415+
'tabix -f -p bed {hammock_gz}'.format(
416+
hammock_gz=hammock_gz,
417+
)
418+
)
398419

399420
rm_f([hammock, hammock_tmp, hammock_tmp2])
421+
400422
return (hammock_gz, hammock_gz_tbi)
401423

402424

@@ -435,7 +457,7 @@ def starch_to_bed_gz(starch, out_dir):
435457
return bed_gz
436458

437459

438-
def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
460+
def peak_to_bigbed(peak, peak_type, chrsz, mem_gb, out_dir):
439461
prefix = os.path.join(out_dir,
440462
os.path.basename(strip_ext(peak)))
441463
bigbed = '{}.{}.bb'.format(prefix, peak_type)
@@ -460,7 +482,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
460482
int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
461483
)
462484
'''
463-
bed_param = '-type=bed6+4 -as={}'.format(as_file)
485+
bed_param = '-type=bed6+4 -as={as_file}'.format(as_file=as_file)
464486
elif peak_type.lower() == 'broadpeak':
465487
as_file_contents = '''table broadPeak
466488
"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
@@ -476,7 +498,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
476498
float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
477499
)
478500
'''
479-
bed_param = '-type=bed6+3 -as={}'.format(as_file)
501+
bed_param = '-type=bed6+3 -as={as_file}'.format(as_file=as_file)
480502
elif peak_type.lower() == 'gappedpeak':
481503
as_file_contents = '''table gappedPeak
482504
"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
@@ -498,26 +520,47 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
498520
float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
499521
)
500522
'''
501-
bed_param = '-type=bed12+3 -as={}'.format(as_file)
523+
bed_param = '-type=bed12+3 -as={as_file}'.format(as_file=as_file)
502524
else:
503-
raise Exception('Unsupported peak file type {}!'.format(peak_type))
525+
raise Exception('Unsupported peak file type {peak_type}!'.format(peak_type=peak_type))
504526

505527
# create temporary .as file
506528
with open(as_file, 'w') as fp:
507529
fp.write(as_file_contents)
508530

509-
cmd1 = "cat {} > {}".format(chrsz, chrsz_tmp)
510-
run_shell_cmd(cmd1)
511-
cmd2 = "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | "
512-
cmd2 += 'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
513-
cmd2 += 'if ($5<0) $5=0; print $0}}\' > {}'
514-
cmd2 = cmd2.format(peak, bigbed_tmp)
515-
run_shell_cmd(cmd2)
516-
cmd3 = "bedClip {} {} {}".format(bigbed_tmp, chrsz_tmp, bigbed_tmp2)
517-
run_shell_cmd(cmd3)
518-
cmd4 = "bedToBigBed {} {} {} {}".format(
519-
bed_param, bigbed_tmp2, chrsz_tmp, bigbed)
520-
run_shell_cmd(cmd4)
531+
run_shell_cmd(
532+
'cat {chrsz} > {chrsz_tmp}'.format(
533+
chrsz=chrsz,
534+
chrsz_tmp=chrsz_tmp,
535+
)
536+
)
537+
538+
run_shell_cmd(
539+
'zcat -f {peak} | LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} | '
540+
'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
541+
'if ($5<0) $5=0; print $0}}\' > {bigbed_tmp}'.format(
542+
peak=peak,
543+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
544+
bigbed_tmp=bigbed_tmp,
545+
)
546+
)
547+
548+
run_shell_cmd(
549+
'bedClip {bigbed_tmp} {chrsz_tmp} {bigbed_tmp2}'.format(
550+
bigbed_tmp=bigbed_tmp,
551+
chrsz_tmp=chrsz_tmp,
552+
bigbed_tmp2=bigbed_tmp2,
553+
)
554+
)
555+
556+
run_shell_cmd(
557+
'bedToBigBed {bed_param} {bigbed_tmp2} {chrsz_tmp} {bigbed}'.format(
558+
bed_param=bed_param,
559+
bigbed_tmp2=bigbed_tmp2,
560+
chrsz_tmp=chrsz_tmp,
561+
bigbed=bigbed,
562+
)
563+
)
521564

522565
# remove temporary files
523566
rm_f([as_file, chrsz_tmp, bigbed_tmp, bigbed_tmp2])

src/encode_task_bwa.py

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
import argparse
1010
from encode_lib_common import (
1111
get_num_lines, log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_fastq,
12-
strip_ext_tar, untar)
12+
strip_ext_tar, untar,
13+
get_gnu_sort_param,
14+
)
1315
from encode_lib_genomic import (
1416
get_read_length, samtools_sort, bam_is_empty, get_samtools_res_param)
1517

@@ -43,7 +45,8 @@ def parse_arguments():
4345
parser.add_argument('--nth', type=int, default=1,
4446
help='Number of threads to parallelize.')
4547
parser.add_argument('--mem-gb', type=float,
46-
help='Max. memory for samtools sort in GB. '
48+
help='Max. memory for samtools sort and GNU sort -S '
49+
'(half of this value will be used for GNU sort) in GB. '
4750
'It should be total memory for this task (not memory per thread).')
4851
parser.add_argument('--out-dir', default='', type=str,
4952
help='Output directory.')
@@ -147,27 +150,30 @@ def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
147150
sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
148151
sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
149152

150-
cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
151-
ref_index_prefix,
152-
sai1,
153-
sai2,
154-
fastq1,
155-
fastq2,
156-
sam)
153+
cmd = 'bwa sampe {ref_index_prefix} {sai1} {sai2} {fastq1} {fastq2} | gzip -nc > {sam}'.format(
154+
ref_index_prefix=ref_index_prefix,
155+
sai1=sai1,
156+
sai2=sai2,
157+
fastq1=fastq1,
158+
fastq2=fastq2,
159+
sam=sam,
160+
)
157161
temp_files.extend([sai1, sai2, sam])
158162
run_shell_cmd(cmd)
159163

160-
cmd2 = 'zcat -f {} | '
161-
cmd2 += 'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
162-
cmd2 += '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
163-
cmd2 += 'n = split(cigar,vals,"[A-Z]"); s = 0; '
164-
cmd2 += 'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
165-
cmd2 += 'if (s!=seqlen) print $1"\\t"; }}\' | '
166-
cmd2 += 'sort | uniq > {}'
167-
cmd2 = cmd2.format(
168-
sam,
169-
badcigar)
170-
run_shell_cmd(cmd2)
164+
run_shell_cmd(
165+
'zcat -f {sam} | '
166+
'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
167+
'{{ cigar=$6; gsub("[0-9]+D","",cigar); '
168+
'n = split(cigar,vals,"[A-Z]"); s = 0; '
169+
'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
170+
'if (s!=seqlen) print $1"\\t"; }}\' | '
171+
'sort {sort_param} | uniq > {badcigar}'.format(
172+
sam=sam,
173+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
174+
badcigar=badcigar,
175+
)
176+
)
171177

172178
# Remove bad CIGAR read pairs
173179
if get_num_lines(badcigar) > 0:

0 commit comments

Comments (0)