Skip to content

Commit 6b70b8d

Browse files
authored
Merge pull request #231 from ENCODE-DCC/dev
v1.9.0
2 parents 6921fd6 + ce5b4be commit 6b70b8d

File tree

7 files changed

+152
-74
lines changed

7 files changed

+152
-74
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
name: build image
7373
command: |
7474
source ${BASH_ENV}
75-
export DOCKER_CACHE_TAG=v1.4.0.1
75+
export DOCKER_CACHE_TAG=v1.8.1
7676
echo "pulling ${DOCKER_CACHE_TAG}!"
7777
docker pull encodedcc/chip-seq-pipeline:${DOCKER_CACHE_TAG}
7878
docker login -u=${DOCKERHUB_USER} -p=${DOCKERHUB_PASS}

chip.wdl

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
version 1.0
22

33
workflow chip {
4-
String pipeline_ver = 'v1.8.1'
4+
String pipeline_ver = 'v1.9.0'
55

66
meta {
7-
version: 'v1.8.1'
7+
version: 'v1.9.0'
88
author: 'Jin wook Lee (leepc12@gmail.com) at ENCODE-DCC'
99
description: 'ENCODE TF/Histone ChIP-Seq pipeline'
1010
specification_document: 'https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit?usp=sharing'
1111

12-
caper_docker: 'encodedcc/chip-seq-pipeline:v1.8.1'
13-
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.8.1'
12+
caper_docker: 'encodedcc/chip-seq-pipeline:v1.9.0'
13+
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.9.0'
1414
croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.v5.json'
1515

1616
parameter_group: {
@@ -162,6 +162,7 @@ workflow chip {
162162
Int xcor_subsample_reads = 15000000
163163
Int xcor_exclusion_range_min = -500
164164
Int? xcor_exclusion_range_max
165+
Int pseudoreplication_random_seed = 0
165166

166167
# group: peak_calling
167168
Int ctl_depth_limit = 200000000
@@ -728,7 +729,11 @@ workflow chip {
728729
group: 'alignment',
729730
help: 'For run_spp.R -s. If not defined default value of `max(read length + 10, 50)` for TF and `max(read_len + 10, 100)` for histone are used'
730731
}
731-
732+
pseudoreplication_random_seed: {
733+
description: 'Random seed (positive integer) used for pseudo-replication (shuffling reads in TAG-ALIGN and then split it into two).',
734+
group: 'alignment',
735+
help: 'Pseudo-replication (task spr) is done by using GNU "shuf --random-source=sha256(random_seed)". If this parameter == 0, then pipeline uses input TAG-ALIGN file\'s size (in bytes) for the random_seed.'
736+
}
732737
ctl_depth_limit: {
733738
description: 'Hard limit for chosen control\'s depth.',
734739
group: 'peak_calling',
@@ -1262,6 +1267,7 @@ workflow chip {
12621267
call spr { input :
12631268
ta = ta_,
12641269
paired_end = paired_end_,
1270+
pseudoreplication_random_seed = pseudoreplication_random_seed,
12651271
mem_factor = spr_mem_factor,
12661272
disk_factor = spr_disk_factor,
12671273
}
@@ -2280,6 +2286,7 @@ task spr {
22802286
input {
22812287
File? ta
22822288
Boolean paired_end
2289+
Int pseudoreplication_random_seed
22832290

22842291
Float mem_factor
22852292
Float disk_factor
@@ -2292,6 +2299,7 @@ task spr {
22922299
set -e
22932300
python3 $(which encode_task_spr.py) \
22942301
${ta} \
2302+
${'--pseudoreplication-random-seed ' + pseudoreplication_random_seed} \
22952303
${if paired_end then '--paired-end' else ''}
22962304
}
22972305
output {

dev/test/test_task/test_spr.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,10 @@
55
"test_spr.ref_pe_ta_pr1" : "chip-seq-pipeline-test-data/ref_output/test_spr/pe/rep1-R1.subsampled.67.merged.nodup.pr1.tagAlign.gz",
66
"test_spr.ref_pe_ta_pr2" : "chip-seq-pipeline-test-data/ref_output/test_spr/pe/rep1-R1.subsampled.67.merged.nodup.pr2.tagAlign.gz",
77
"test_spr.ref_se_ta_pr1" : "chip-seq-pipeline-test-data/ref_output/test_spr/se/rep1.subsampled.25.merged.nodup.pr1.tagAlign.gz",
8-
"test_spr.ref_se_ta_pr2" : "chip-seq-pipeline-test-data/ref_output/test_spr/se/rep1.subsampled.25.merged.nodup.pr2.tagAlign.gz"
8+
"test_spr.ref_se_ta_pr2" : "chip-seq-pipeline-test-data/ref_output/test_spr/se/rep1.subsampled.25.merged.nodup.pr2.tagAlign.gz",
9+
10+
"test_spr.ref_pe_seed_10_ta_pr1" : "chip-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/rep1-R1.subsampled.67.merged.nodup.pr1.tagAlign.gz",
11+
"test_spr.ref_pe_seed_10_ta_pr2" : "chip-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/rep1-R1.subsampled.67.merged.nodup.pr2.tagAlign.gz",
12+
"test_spr.ref_se_seed_10_ta_pr1" : "chip-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/rep1.subsampled.25.merged.nodup.pr1.tagAlign.gz",
13+
"test_spr.ref_se_seed_10_ta_pr2" : "chip-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/rep1.subsampled.25.merged.nodup.pr2.tagAlign.gz"
914
}

dev/test/test_task/test_spr.wdl

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,46 @@ import 'compare_md5sum.wdl' as compare_md5sum
44

55
workflow test_spr {
66
input {
7-
String pe_ta
8-
String se_ta
7+
File pe_ta
8+
File se_ta
99

10-
String ref_pe_ta_pr1
11-
String ref_pe_ta_pr2
12-
String ref_se_ta_pr1
13-
String ref_se_ta_pr2
10+
File ref_pe_ta_pr1
11+
File ref_pe_ta_pr2
12+
File ref_se_ta_pr1
13+
File ref_se_ta_pr2
14+
File ref_pe_seed_10_ta_pr1
15+
File ref_pe_seed_10_ta_pr2
16+
File ref_se_seed_10_ta_pr1
17+
File ref_se_seed_10_ta_pr2
1418
}
1519
Float spr_mem_factor = 0.0
1620
Float spr_disk_factor = 6.0
1721

1822
call chip.spr as pe_spr { input :
1923
ta = pe_ta,
2024
paired_end = true,
21-
25+
pseudoreplication_random_seed = 0,
2226
mem_factor = spr_mem_factor,
2327
disk_factor = spr_disk_factor,
2428
}
2529
call chip.spr as se_spr { input :
2630
ta = se_ta,
2731
paired_end = false,
28-
32+
pseudoreplication_random_seed = 0,
33+
mem_factor = spr_mem_factor,
34+
disk_factor = spr_disk_factor,
35+
}
36+
call chip.spr as pe_spr_seed_10 { input :
37+
ta = pe_ta,
38+
paired_end = true,
39+
pseudoreplication_random_seed = 10,
40+
mem_factor = spr_mem_factor,
41+
disk_factor = spr_disk_factor,
42+
}
43+
call chip.spr as se_spr_seed_10 { input :
44+
ta = se_ta,
45+
paired_end = false,
46+
pseudoreplication_random_seed = 10,
2947
mem_factor = spr_mem_factor,
3048
disk_factor = spr_disk_factor,
3149
}
@@ -36,18 +54,30 @@ workflow test_spr {
3654
'pe_spr_pr2',
3755
'se_spr_pr1',
3856
'se_spr_pr2',
57+
'pe_spr_seed_10_pr1',
58+
'pe_spr_seed_10_pr2',
59+
'se_spr_seed_10_pr1',
60+
'se_spr_seed_10_pr2',
3961
],
4062
files = [
4163
pe_spr.ta_pr1,
4264
pe_spr.ta_pr2,
4365
se_spr.ta_pr1,
4466
se_spr.ta_pr2,
67+
pe_spr_seed_10.ta_pr1,
68+
pe_spr_seed_10.ta_pr2,
69+
se_spr_seed_10.ta_pr1,
70+
se_spr_seed_10.ta_pr2,
4571
],
4672
ref_files = [
4773
ref_pe_ta_pr1,
4874
ref_pe_ta_pr2,
4975
ref_se_ta_pr1,
5076
ref_se_ta_pr2,
77+
ref_pe_seed_10_ta_pr1,
78+
ref_pe_seed_10_ta_pr2,
79+
ref_se_seed_10_ta_pr1,
80+
ref_se_seed_10_ta_pr2,
5181
],
5282
}
5383
}

docs/input.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ Parameter|Type | Description
225225
Parameter|Default|Description
226226
---------|-------|-----------
227227
`chip.filter_chrs` | `[]` (empty array of string) | Array of chromosome names to be filtered out from a final (filtered/nodup) BAM. No chromosomes are filtered out by default.
228+
`chip.pseudoreplication_random_seed` | `0` | Random seed (positive integer) used for pseudo-replication (shuffling the reads in a TAG-ALIGN file and then splitting them into two halves). If `0`, the TAG-ALIGN file's uncompressed size (in bytes) is used as the random seed.
228229
229230
## Resource parameters
230231

example_input_json/template.full.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"chip.subsample_reads" : 0,
3636
"chip.ctl_subsample_reads" : 0,
3737
"chip.xcor_subsample_reads" : 15000000,
38+
"chip.pseudoreplication_random_seed" : 0,
3839

3940
"chip.xcor_trim_bp" : 50,
4041
"chip.use_filt_pe_ta_for_xcor" : false,

src/encode_task_spr.py

Lines changed: 92 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ def parse_arguments():
1818
help='Path for TAGALIGN file.')
1919
parser.add_argument('--paired-end', action="store_true",
2020
help='Paired-end TAGALIGN.')
21+
parser.add_argument('--pseudoreplication-random-seed',
22+
type=int, default=0,
23+
help='Set it to 0 to use file\'s size (in bytes) as random seed.'
24+
'Otherwise this seed will be used for GNU shuf --random-source=sha256(seed).'
25+
'It is useful when random seed based on input file size does not work.')
2126
parser.add_argument('--out-dir', default='', type=str,
2227
help='Output directory.')
2328
parser.add_argument('--log-level', default='INFO',
@@ -32,7 +37,7 @@ def parse_arguments():
3237
return args
3338

3439

35-
def spr_se(ta, out_dir):
40+
def spr_se(ta, pseudoreplication_random_seed, out_dir):
3641
prefix = os.path.join(out_dir,
3742
os.path.basename(strip_ext_ta(ta)))
3843
tmp_pr1 = '{}.00'.format(prefix)
@@ -41,35 +46,42 @@ def spr_se(ta, out_dir):
4146
ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
4247
nlines = int((get_num_lines(ta)+1)/2)
4348

49+
if pseudoreplication_random_seed == 0:
50+
random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
51+
log.info(
52+
'Using input file\'s size {random_seed} as random seed for pseudoreplication.'.format(
53+
random_seed=random_seed,
54+
)
55+
)
56+
else:
57+
random_seed = pseudoreplication_random_seed
58+
log.info(
59+
'Using a fixed integer {random_seed} as random seed for pseudoreplication.'.format(
60+
random_seed=random_seed,
61+
)
62+
)
63+
4464
# bash-only
45-
cmd1 = 'zcat {} | shuf --random-source=<(openssl enc '
46-
cmd1 += '-aes-256-ctr -pass pass:$(zcat -f {} | wc -c) '
47-
cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
48-
cmd1 += 'split -d -l {} - {}.'
49-
cmd1 = cmd1.format(
50-
ta,
51-
ta,
52-
nlines,
53-
prefix)
54-
run_shell_cmd(cmd1)
55-
56-
cmd2 = 'gzip -nc {} > {}'
57-
cmd2 = cmd2.format(
58-
tmp_pr1,
59-
ta_pr1)
60-
run_shell_cmd(cmd2)
61-
62-
cmd3 = 'gzip -nc {} > {}'
63-
cmd3 = cmd3.format(
64-
tmp_pr2,
65-
ta_pr2)
66-
run_shell_cmd(cmd3)
65+
run_shell_cmd(
66+
'zcat {ta} | shuf --random-source=<(openssl enc '
67+
'-aes-256-ctr -pass pass:{random_seed} '
68+
'-nosalt </dev/zero 2>/dev/null) | '
69+
'split -d -l {nlines} - {prefix}.'.format(
70+
ta=ta,
71+
random_seed=random_seed,
72+
nlines=nlines,
73+
prefix=prefix,
74+
)
75+
)
76+
77+
run_shell_cmd('gzip -nc {tmp_pr1} > {ta_pr1}'.format(tmp_pr1=tmp_pr1, ta_pr1=ta_pr1))
78+
run_shell_cmd('gzip -nc {tmp_pr2} > {ta_pr2}'.format(tmp_pr2=tmp_pr2, ta_pr2=ta_pr2))
6779

6880
rm_f([tmp_pr1, tmp_pr2])
6981
return ta_pr1, ta_pr2
7082

7183

72-
def spr_pe(ta, out_dir):
84+
def spr_pe(ta, pseudoreplication_random_seed, out_dir):
7385
prefix = os.path.join(out_dir,
7486
os.path.basename(strip_ext_ta(ta)))
7587
tmp_pr1 = '{}.00'.format(prefix)
@@ -78,40 +90,57 @@ def spr_pe(ta, out_dir):
7890
ta_pr2 = '{}.pr2.tagAlign.gz'.format(prefix)
7991
nlines = int((get_num_lines(ta)/2+1)/2)
8092

93+
if pseudoreplication_random_seed == 0:
94+
random_seed = run_shell_cmd('zcat -f {ta} | wc -c'.format(ta=ta))
95+
log.info(
96+
'Using input file\'s size {random_seed} as random seed for pseudoreplication.'.format(
97+
random_seed=random_seed,
98+
)
99+
)
100+
else:
101+
random_seed = pseudoreplication_random_seed
102+
log.info(
103+
'Using a fixed integer {random_seed} as random seed for pseudoreplication.'.format(
104+
random_seed=random_seed,
105+
)
106+
)
107+
81108
# bash-only
82-
cmd1 = 'zcat -f {} | sed \'N;s/\\n/\\t/\' | '
83-
cmd1 += 'shuf --random-source=<(openssl enc -aes-256-ctr '
84-
cmd1 += '-pass pass:$(zcat -f {} | wc -c) '
85-
cmd1 += '-nosalt </dev/zero 2>/dev/null) | '
86-
cmd1 += 'split -d -l {} - {}.'
87-
cmd1 = cmd1.format(
88-
ta,
89-
ta,
90-
nlines,
91-
prefix)
92-
run_shell_cmd(cmd1)
93-
94-
cmd2 = 'zcat -f {} | '
95-
cmd2 += 'awk \'BEGIN{{OFS="\\t"}} '
96-
cmd2 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
97-
cmd2 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
98-
cmd2 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
99-
cmd2 += 'gzip -nc > {}'
100-
cmd2 = cmd2.format(
101-
tmp_pr1,
102-
ta_pr1)
103-
run_shell_cmd(cmd2)
104-
105-
cmd3 = 'zcat -f {} | '
106-
cmd3 += 'awk \'BEGIN{{OFS="\\t"}} '
107-
cmd3 += '{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
108-
cmd3 += '%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
109-
cmd3 += '$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
110-
cmd3 += 'gzip -nc > {}'
111-
cmd3 = cmd3.format(
112-
tmp_pr2,
113-
ta_pr2)
114-
run_shell_cmd(cmd3)
109+
run_shell_cmd(
110+
'zcat -f {ta} | sed \'N;s/\\n/\\t/\' | '
111+
'shuf --random-source=<(openssl enc -aes-256-ctr '
112+
'-pass pass:{random_seed} -nosalt </dev/zero 2>/dev/null) | '
113+
'split -d -l {nlines} - {prefix}.'.format(
114+
ta=ta,
115+
random_seed=random_seed,
116+
nlines=nlines,
117+
prefix=prefix,
118+
)
119+
)
120+
121+
run_shell_cmd(
122+
'zcat -f {tmp_pr1} | '
123+
'awk \'BEGIN{{OFS="\\t"}} '
124+
'{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
125+
'%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
126+
'$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
127+
'gzip -nc > {ta_pr1}'.format(
128+
tmp_pr1=tmp_pr1,
129+
ta_pr1=ta_pr1,
130+
)
131+
)
132+
133+
run_shell_cmd(
134+
'zcat -f {tmp_pr2} | '
135+
'awk \'BEGIN{{OFS="\\t"}} '
136+
'{{printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n'
137+
'%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n",'
138+
'$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}}\' | '
139+
'gzip -nc > {ta_pr2}'.format(
140+
tmp_pr2=tmp_pr2,
141+
ta_pr2=ta_pr2,
142+
)
143+
)
115144

116145
rm_f([tmp_pr1, tmp_pr2])
117146
return ta_pr1, ta_pr2
@@ -125,9 +154,13 @@ def main():
125154

126155
log.info('Making self-pseudo replicates...')
127156
if args.paired_end:
128-
ta_pr1, ta_pr2 = spr_pe(args.ta, args.out_dir)
157+
ta_pr1, ta_pr2 = spr_pe(
158+
args.ta, args.pseudoreplication_random_seed, args.out_dir,
159+
)
129160
else:
130-
ta_pr1, ta_pr2 = spr_se(args.ta, args.out_dir)
161+
ta_pr1, ta_pr2 = spr_se(
162+
args.ta, args.pseudoreplication_random_seed, args.out_dir,
163+
)
131164

132165
log.info('List all files in output directory...')
133166
ls_l(args.out_dir)

0 commit comments

Comments
 (0)