From 75e0f904cbf4c04a73b5e96bb6dc949993e381a3 Mon Sep 17 00:00:00 2001 From: sitag Date: Tue, 12 Nov 2019 00:53:21 -0500 Subject: [PATCH 01/45] Update readme.md --- readme.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/readme.md b/readme.md index c6838f6..4e3c289 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,9 @@ # IHEC wrapper for ENCODE ChIP-Seq pipeline. +This is the IHEC wrapper around the ENCODE ChIP-Seq pipeline that captures that IHEC specific configurations for analyzing ChIP-Seq data through the ENCODE pipeline. + +For an introduction to the ENCODE ChIP-Seq pipeline see: (ENCODE-DCC/chip-seq-pipeline2)[https://github.com/ENCODE-DCC/chip-seq-pipeline2] + Please see the documentation available [here](encode-wrapper/readme.md) to set up and test the pipeline. Also refer to IHEC standard [workflows](ihec_standard_workflow.md) From 05083fa2a033c78ea64994f37fca401def2d4533 Mon Sep 17 00:00:00 2001 From: sitag Date: Tue, 12 Nov 2019 00:54:02 -0500 Subject: [PATCH 02/45] Update readme.md --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 4e3c289..f818d78 100644 --- a/readme.md +++ b/readme.md @@ -2,7 +2,7 @@ This is the IHEC wrapper around the ENCODE ChIP-Seq pipeline that captures that IHEC specific configurations for analyzing ChIP-Seq data through the ENCODE pipeline. -For an introduction to the ENCODE ChIP-Seq pipeline see: (ENCODE-DCC/chip-seq-pipeline2)[https://github.com/ENCODE-DCC/chip-seq-pipeline2] +For an introduction to the ENCODE ChIP-Seq pipeline see: [ENCODE-DCC/chip-seq-pipeline2](https://github.com/ENCODE-DCC/chip-seq-pipeline2) Please see the documentation available [here](encode-wrapper/readme.md) to set up and test the pipeline. 
From 5cf17c37dc3bb95a5d108e3d989538c3860d5f64 Mon Sep 17 00:00:00 2001 From: sitag Date: Wed, 1 May 2019 00:29:30 -0700 Subject: [PATCH 03/45] file tracking --- encode-wrapper/trackoutput.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index a3944b1..0c6406c 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -18,9 +18,13 @@ def byino(fs): if not ino in hashed: hashed[ino] = list() hashed[ino].append(e) hashed2 = dict() + full_flist = dict() for k,v in hashed.items(): - hashed2[k] = sorted(v, key = lambda x: os.path.basename(x))[0] - return hashed2 + sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) + hashed2[k] = sortedfiles[0] + assert not sortedfiles[0] in full_flist + full_flist[sortedfiles[0]] = sortedfiles[1:] + return (hashed2, full_flist) def md5script(hashed): def cmd(f): @@ -31,22 +35,34 @@ def cmd(f): return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] -def trackoutput(base, i): +def trackoutput(base, i, filereport): logerr('# looking in {0}\n'.format(base)) bams = findfiles(base, '*.bam') narrowpeaks = findfiles(base, '*narrow*gz') - bamsbyino = byino(bams) - peaksbyino = byino(narrowpeaks) - print writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino)) - print findfiles(base, 'qc.html') + (bamsbyino, bams_flist) = byino(bams) + (peaksbyino, peaks_flist) = byino(narrowpeaks) + + if not filereport: + print writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino)) + + qc = findfiles(base, 'qc.html') + print qc + + return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : qc } + + def main(args): + filereport = '-filereport' in args + args = [e for e in args if not e in ['-filereport'] ] targets = ['.'] if len(args) == 0 else args + output = list() for i, arg in 
enumerate(targets): - trackoutput(arg, i) - + output.append(trackoutput(arg, i, filereport)) + if targets: + print jdumpf('./filereport.json', output) if __name__ == '__main__': From b56a763390f4b778dd4fc36035990b1f4c987d3a Mon Sep 17 00:00:00 2001 From: sitag Date: Fri, 3 May 2019 01:45:48 -0700 Subject: [PATCH 04/45] track all encode extensions --- encode-wrapper/patterns.json | 52 +++++++++++++++++++++++++++++++++++ encode-wrapper/trackoutput.py | 37 ++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 encode-wrapper/patterns.json diff --git a/encode-wrapper/patterns.json b/encode-wrapper/patterns.json new file mode 100644 index 0000000..7bbed6d --- /dev/null +++ b/encode-wrapper/patterns.json @@ -0,0 +1,52 @@ +[ + "*merge*fastq*R*.fastq.gz", + "*.trim*bp.fastq.gz", + "*.bam", + "*.bai", + "*.flagstat.qc", + "*.nodup.bam", + "*.nodup.flagstat.qc", + "*.dup.qc", + "*.pbc.qc", + "*.tagAlign.gz", + "*.N.tagAlign.gz", + "*.tn5.tagAlign.gz", + "*.pr1.tagAlign.gz", + "*.pr2.tagAlign.gz", + "*.tagAlign.gz", + "*.jsd.qc", + "*.png", + "*ctl_for_rep*.tagAlign.gz", + "*.cc.plot.pdf", + "*.cc.plot.png", + "*.cc.qc", + "*.cc.fraglen.txt", + "*.narrowPeak.gz", + "*.bfilt.narrowPeak.gz", + "*.bfilt.narrowPeak.bb", + "*.pval.signal.bigwig", + "*.fc.signal.bigwig", + "*.frip.qc", + "*.regionPeak.gz", + "*.bfilt.regionPeak.gz", + "*.bfilt.regionPeak.bb", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.txt.png", + "*.txt.gz", + "*.log", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.frip.qc", + "*.reproducibility.qc", + "*optimal_peak.gz", + "*optimal_peak.bb", + "*conservative_peak.gz", + "*conservative_peak.bb", + "*qc.html", + "*qc.json" +] \ No newline at end of file diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index 0c6406c..89593b9 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -11,6 +11,9 @@ def 
findfiles(base, pattern): return [e.strip() for e in p.stdout.readlines()] +def flistsize(fs): + return {e : os.stat(e).st_size for e in fs} + def byino(fs): hashed = dict() for e in fs: @@ -52,18 +55,44 @@ def trackoutput(base, i, filereport): +def make_filereport(patterns, base): + logerr('# looking in {0}\n'.format(base)) + fbyino, flist = dict(), dict() + for p in patterns: + found = findfiles(base, p) + (a, b) = byino(found) + fbyino[p] = a + flist[p] = b + return {'byino' : fbyino, 'flist':flist} def main(args): filereport = '-filereport' in args args = [e for e in args if not e in ['-filereport'] ] + assert len(args) == 1 targets = ['.'] if len(args) == 0 else args output = list() + keep = list() + + patterns = jloadf('patterns.json') for i, arg in enumerate(targets): - output.append(trackoutput(arg, i, filereport)) - if targets: - print jdumpf('./filereport.json', output) - + if filereport: + record = make_filereport(patterns, arg) + output.append(record) + for p in patterns: + print p, sum(flistsize(record['flist'][p]).values())/10**9, 'GB approx' + keep.extend(record['flist'][p]) + else: + record = trackoutput(arg, i, filereport) + output.append(record) + keep.extend(record['bams']) + keep.extend(record['peaks']) + + print jdumpf('./filereport.json', output) + print writef('./keep.filelist', keep) + print 'size', sum(flistsize(keep).values()) + + if __name__ == '__main__': main(sys.argv[1:]) From a71e2f892debc12ca433d826e5190558cdd9d288 Mon Sep 17 00:00:00 2001 From: sitag Date: Sun, 28 Jul 2019 18:21:56 -0700 Subject: [PATCH 05/45] cleanup config --- encode-wrapper/chip.py | 1 + encode-wrapper/cleanup.json | 60 +++++++++++++++++++++++++ encode-wrapper/computemd5s.py | 82 +++++++++++++++++++++++++++++++++++ encode-wrapper/patterns.json | 6 ++- encode-wrapper/trackoutput.py | 56 ++++++++++++++---------- 5 files changed, 179 insertions(+), 26 deletions(-) create mode 100644 encode-wrapper/cleanup.json create mode 100644 encode-wrapper/computemd5s.py diff 
--git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index c14564f..7cb3821 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -134,6 +134,7 @@ def write_testrun(config): logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(config['home']), infile.read().format(**config)) + '\n') logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity\n\nsingularity exec {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} "${{1:-Local}}" ${{@:2}}\n\n'.format(**config))) + logerrn(dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $?\n\n'.format(**config))) return dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec {additional_binds} {container_image} {home_mnt}/piperunner.sh {home_mnt} $1 $BACKEND\n\n'.format(**config)) diff --git a/encode-wrapper/cleanup.json b/encode-wrapper/cleanup.json new file mode 100644 index 0000000..4013873 --- /dev/null +++ b/encode-wrapper/cleanup.json @@ -0,0 +1,60 @@ +{ + +"delete": ["*.fastq.gz", "*.fastq", "*fasta.tar", "*.tagAlign.gz"], + + +"patterns":[ + "*.bb", "*.bw", "*.bigwig", "*.bigbed", "*narrowPeak.hammock.gz*", "*.badReads", "*fasta.tar", + "*.fastq.gz", "*.fastq", + "*merge*fastq*R*.fastq.gz", + "*.trim*bp.fastq.gz", + "*.bam", + "*.bai", + "*.flagstat.qc", + "*.nodup.bam", + "*.nodup.flagstat.qc", + "*.dup.qc", + "*.pbc.qc", + "*.tagAlign.gz", + "*.N.tagAlign.gz", + "*.tn5.tagAlign.gz", + "*.pr1.tagAlign.gz", + "*.pr2.tagAlign.gz", + "*.tagAlign.gz", + "*.jsd.qc", + "*.png", + "*ctl_for_rep*.tagAlign.gz", + "*.cc.plot.pdf", + "*.cc.plot.png", + "*.cc.qc", + "*.cc.fraglen.txt", + "*.narrowPeak.gz", + "*.bfilt.narrowPeak.gz", + "*.bfilt.narrowPeak.bb", + "*.pval.signal.bigwig", + "*.fc.signal.bigwig", + "*.frip.qc", + 
"*.regionPeak.gz", + "*.bfilt.regionPeak.gz", + "*.bfilt.regionPeak.bb", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.txt.png", + "*.txt.gz", + "*.log", "*.txt", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.frip.qc", + "*.reproducibility.qc", + "*optimal_peak.gz", + "*optimal_peak.bb", + "*conservative_peak.gz", + "*conservative_peak.bb", + "*qc.html", + "*qc.json" +] +} diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py new file mode 100644 index 0000000..ce6b7cd --- /dev/null +++ b/encode-wrapper/computemd5s.py @@ -0,0 +1,82 @@ +from utilsm import * +import os +import sys + + + +def findfiles(base, pattern): + cmd = "find {0} -name '{1}'".format(base, pattern) + print cmd + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return [e.strip() for e in p.stdout.readlines()] + + +def flistsize(fs): + return {e : os.stat(e).st_size for e in fs} + +def byino(fs): + hashed = dict() + for e in fs: + ino = os.stat(e).st_ino + if not ino in hashed: hashed[ino] = list() + hashed[ino].append(e) + hashed2 = dict() + full_flist = dict() + for k,v in hashed.items(): + sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) + hashed2[k] = sortedfiles[0] + assert not sortedfiles[0] in full_flist + full_flist[sortedfiles[0]] = sortedfiles[1:] + return (hashed2, full_flist) + +def md5script(hashed): + def cmd(f): + if f.strip().endswith('bam'): + return 'echo "{1} $(./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) + else: + return 'echo "{1} $(md5sum {0})"'.format(f, os.path.basename(f)) + + return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] + +def trackoutput(base, i, filereport): + logerr('# looking in {0}\n'.format(base)) + bams = findfiles(base, '*.bam') + narrowpeaks = findfiles(base, '*narrow*gz') + (bamsbyino, bams_flist) = byino(bams) + (peaksbyino, peaks_flist) = byino(narrowpeaks) + + 
if not filereport: + print writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino)) + + qc = findfiles(base, 'qc.html') + print qc + + return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1]} + + + + + +def main(args): + assert len(args) == 1, '__only_one_target_directory_at_a_time__' + targets = ['.'] if len(args) == 0 else args + output = list() + keep = list() + + for i, arg in enumerate(targets): + record = trackoutput(arg, i, False) + output.append(record) + keep.extend(record['bams']) + keep.extend(record['peaks']) + + print jdumpf('./filereport.json', output) + print jdumpf('./file_shortreport.json', map(lambda o: {k: sorted(o[k].keys()) for k in o}, output)) + print 'size', sum(flistsize(keep).values()) + + + +if __name__ == '__main__': + main(sys.argv[1:]) + + + diff --git a/encode-wrapper/patterns.json b/encode-wrapper/patterns.json index 7bbed6d..08346c5 100644 --- a/encode-wrapper/patterns.json +++ b/encode-wrapper/patterns.json @@ -1,4 +1,6 @@ [ + "*.bb", "*.bw", "*.bigwig", "*.bigbed", "*narrowPeak.hammock.gz*", "*.badReads", "*fasta.tar", + "*.fastq.gz", "*.fastq", "*merge*fastq*R*.fastq.gz", "*.trim*bp.fastq.gz", "*.bam", @@ -36,7 +38,7 @@ "*.bfilt.*Peak.bb", "*.txt.png", "*.txt.gz", - "*.log", + "*.log", "*.txt", "*.frip.qc", "*.*Peak.gz", "*.bfilt.*Peak.gz", @@ -49,4 +51,4 @@ "*conservative_peak.bb", "*qc.html", "*qc.json" -] \ No newline at end of file +] diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index 89593b9..76e51df 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -5,11 +5,15 @@ def findfiles(base, pattern): - cmd = "find {0} -name '{1}'".format(base, pattern) + cmd = "find {0} -iname '{1}'".format(base, pattern) print cmd p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) return [e.strip() for e in p.stdout.readlines()] +def listfiles(base): + p = subprocess.Popen("find {0} -type 
f".format(base), shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return [e.strip() for e in p.stdout.readlines()] + def flistsize(fs): return {e : os.stat(e).st_size for e in fs} @@ -51,7 +55,7 @@ def trackoutput(base, i, filereport): qc = findfiles(base, 'qc.html') print qc - return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : qc } + return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1] } @@ -67,30 +71,34 @@ def make_filereport(patterns, base): def main(args): - filereport = '-filereport' in args - args = [e for e in args if not e in ['-filereport'] ] - assert len(args) == 1 - targets = ['.'] if len(args) == 0 else args - output = list() - keep = list() + filereport = True + assert len(args) == 1, '__only_one_target_directory_at_a_time__' + + arg = args[0] + + config = jloadf('cleanup.json') + patterns = config["patterns"] + assert not "extra" in patterns + output = make_filereport(patterns, arg) + short_out = {k: sorted(output['flist'][k].keys()) for k in output['flist']} + allfiles = listfiles(arg) + + patternfiles = list() + for p in patterns: + record = output + print p, sum(flistsize(record['flist'][p]).values())/10**9, 'GB approx' + patternfiles.extend(record['flist'][p].keys() + [e for k in record['flist'][p] for e in record['flist'][p][k]] ) + - patterns = jloadf('patterns.json') - for i, arg in enumerate(targets): - if filereport: - record = make_filereport(patterns, arg) - output.append(record) - for p in patterns: - print p, sum(flistsize(record['flist'][p]).values())/10**9, 'GB approx' - keep.extend(record['flist'][p]) - else: - record = trackoutput(arg, i, filereport) - output.append(record) - keep.extend(record['bams']) - keep.extend(record['peaks']) - + extra = [f for f in allfiles if not f in patternfiles] + print jdumpf('./filereport.json', output) - print writef('./keep.filelist', keep) - print 'size', sum(flistsize(keep).values()) + print jdumpf('./file_shortreport.json', short_out) + + + print 
jdumpf('./unrecognized_files.json', extra) + + #print 'size', sum(flistsize(keep).values()) From e334f1b43519ca32de60b924dd72265fb1f4a91a Mon Sep 17 00:00:00 2001 From: sitag Date: Sun, 28 Jul 2019 20:18:47 -0700 Subject: [PATCH 06/45] cleanup filt.bam + master filelist --- encode-wrapper/cleanup.json | 136 +++++++++++++++++++--------------- encode-wrapper/trackoutput.py | 21 +++++- 2 files changed, 96 insertions(+), 61 deletions(-) diff --git a/encode-wrapper/cleanup.json b/encode-wrapper/cleanup.json index 4013873..05d2fc0 100644 --- a/encode-wrapper/cleanup.json +++ b/encode-wrapper/cleanup.json @@ -1,60 +1,78 @@ { - -"delete": ["*.fastq.gz", "*.fastq", "*fasta.tar", "*.tagAlign.gz"], - - -"patterns":[ - "*.bb", "*.bw", "*.bigwig", "*.bigbed", "*narrowPeak.hammock.gz*", "*.badReads", "*fasta.tar", - "*.fastq.gz", "*.fastq", - "*merge*fastq*R*.fastq.gz", - "*.trim*bp.fastq.gz", - "*.bam", - "*.bai", - "*.flagstat.qc", - "*.nodup.bam", - "*.nodup.flagstat.qc", - "*.dup.qc", - "*.pbc.qc", - "*.tagAlign.gz", - "*.N.tagAlign.gz", - "*.tn5.tagAlign.gz", - "*.pr1.tagAlign.gz", - "*.pr2.tagAlign.gz", - "*.tagAlign.gz", - "*.jsd.qc", - "*.png", - "*ctl_for_rep*.tagAlign.gz", - "*.cc.plot.pdf", - "*.cc.plot.png", - "*.cc.qc", - "*.cc.fraglen.txt", - "*.narrowPeak.gz", - "*.bfilt.narrowPeak.gz", - "*.bfilt.narrowPeak.bb", - "*.pval.signal.bigwig", - "*.fc.signal.bigwig", - "*.frip.qc", - "*.regionPeak.gz", - "*.bfilt.regionPeak.gz", - "*.bfilt.regionPeak.bb", - "*.frip.qc", - "*.*Peak.gz", - "*.bfilt.*Peak.gz", - "*.bfilt.*Peak.bb", - "*.txt.png", - "*.txt.gz", - "*.log", "*.txt", - "*.frip.qc", - "*.*Peak.gz", - "*.bfilt.*Peak.gz", - "*.bfilt.*Peak.bb", - "*.frip.qc", - "*.reproducibility.qc", - "*optimal_peak.gz", - "*optimal_peak.bb", - "*conservative_peak.gz", - "*conservative_peak.bb", - "*qc.html", - "*qc.json" -] -} + "delete": [ + "*.fastq.gz", + "*.fastq", + "*fasta.tar", + "*.tagAlign.gz", + "*trim_50bp.bam.bai", + "*trim_50bp.bam", + "*filt.bam", + 
"*filt.bam.bai" + ], + "patterns": [ + "*trim_50bp.bam.bai", + "*trim_50bp.bam", + "*filt.bam", + "*filt.bam.bai", + "*.bb", + "*.bw", + "*.bigwig", + "*.bigbed", + "*narrowPeak.hammock.gz*", + "*.badReads", + "*fasta.tar", + "*.fastq.gz", + "*.fastq", + "*merge*fastq*R*.fastq.gz", + "*.trim*bp.fastq.gz", + "*.bam", + "*.bai", + "*.flagstat.qc", + "*.nodup.bam", + "*.nodup.flagstat.qc", + "*.dup.qc", + "*.pbc.qc", + "*.tagAlign.gz", + "*.N.tagAlign.gz", + "*.tn5.tagAlign.gz", + "*.pr1.tagAlign.gz", + "*.pr2.tagAlign.gz", + "*.tagAlign.gz", + "*.jsd.qc", + "*.png", + "*ctl_for_rep*.tagAlign.gz", + "*.cc.plot.pdf", + "*.cc.plot.png", + "*.cc.qc", + "*.cc.fraglen.txt", + "*.narrowPeak.gz", + "*.bfilt.narrowPeak.gz", + "*.bfilt.narrowPeak.bb", + "*.pval.signal.bigwig", + "*.fc.signal.bigwig", + "*.frip.qc", + "*.regionPeak.gz", + "*.bfilt.regionPeak.gz", + "*.bfilt.regionPeak.bb", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.txt.png", + "*.txt.gz", + "*.log", + "*.txt", + "*.frip.qc", + "*.*Peak.gz", + "*.bfilt.*Peak.gz", + "*.bfilt.*Peak.bb", + "*.frip.qc", + "*.reproducibility.qc", + "*optimal_peak.gz", + "*optimal_peak.bb", + "*conservative_peak.gz", + "*conservative_peak.bb", + "*qc.html", + "*qc.json" + ] +} \ No newline at end of file diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index 76e51df..d9e69a9 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -75,7 +75,7 @@ def main(args): assert len(args) == 1, '__only_one_target_directory_at_a_time__' arg = args[0] - + config = jloadf('cleanup.json') patterns = config["patterns"] assert not "extra" in patterns @@ -83,6 +83,9 @@ def main(args): short_out = {k: sorted(output['flist'][k].keys()) for k in output['flist']} allfiles = listfiles(arg) + + + patternfiles = list() for p in patterns: record = output @@ -95,7 +98,21 @@ def main(args): print jdumpf('./filereport.json', output) print jdumpf('./file_shortreport.json', 
short_out) - + rmlist = list() + for ftype in config["delete"]: + for k in record['flist'][ftype]: + rmlist.extend(record['flist'][ftype][k]) + rmlist.append(k) + + keep = list() + for k in record['flist']: + if not k in config['delete']: + keep.extend([e for e in record['flist'][k].keys() if not e in rmlist]) + keep = sorted(list(set(keep))) + + + print dumpf('./delete.list', '\n'.join(rmlist) + '\n') + print dumpf('./masterfiles.list', '\n'.join(keep) + '\n') print jdumpf('./unrecognized_files.json', extra) #print 'size', sum(flistsize(keep).values()) From d9c3b44d2804802e45122474d383cc1da71e1092 Mon Sep 17 00:00:00 2001 From: sitag Date: Sun, 28 Jul 2019 20:30:38 -0700 Subject: [PATCH 07/45] change md5 script generation --- encode-wrapper/computemd5s.py | 8 ++++---- encode-wrapper/readme.md | 12 +++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py index ce6b7cd..a567407 100644 --- a/encode-wrapper/computemd5s.py +++ b/encode-wrapper/computemd5s.py @@ -58,13 +58,13 @@ def trackoutput(base, i, filereport): def main(args): - assert len(args) == 1, '__only_one_target_directory_at_a_time__' - targets = ['.'] if len(args) == 0 else args + assert len(args) == 2, '__only_one_target_directory_at_a_time__' + [targets, tag] = args #['.'] if len(args) == 0 else args output = list() keep = list() - for i, arg in enumerate(targets): - record = trackoutput(arg, i, False) + for i, arg in enumerate([targets]): + record = trackoutput(arg, tag, False) output.append(record) keep.extend(record['bams']) keep.extend(record['peaks']) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index e1c0aa9..8428272 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -34,6 +34,8 @@ This will write: * singularity_wrapper.sh +* trackoutput.sh + If you are running in `Local` mode using using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as 
`/mnt/ext_0`, `$PWD/data_a` as `/mnt/ext_1` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and shift other bind points further along `ext_$i`'s. For example, @@ -91,13 +93,13 @@ IHEC tests can be run with: The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. Currently the only local mode is supported for singularity. The ENCODE documentation discusses other modes. -To compute md5s of generated file, use `trackoutput.py ...`. This will locate peak calls and bam files, and generate scripts `computemd5s_$i` to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. +To compute md5s of generated file, use `computemd5s.py `. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `$PWD/h3k4me3_out`. So do - python trackoutput.py $PWD/h3k4me3_out - chmod +x ./computemd5s_0 - ./computemd5s_0 > log_h3k4me3 + python computemd5s.py $PWD/h3k4me3_out _test + chmod +x ./computemd5s_test + ./computemd5s_test > log_h3k4me3 python status_cemt.py log_h3k4me3 expected_md5s_h3k4me3.json This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. @@ -124,7 +126,7 @@ This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. } - +See output of `./trackoutput.sh ` to see what files are to be copied over. 
This list will be in `masterfiles.list` From ad5f03f3f9223d8e2d3e5304fea2bac63a773cbf Mon Sep 17 00:00:00 2001 From: sitag Date: Wed, 31 Jul 2019 19:52:06 -0700 Subject: [PATCH 08/45] add unrecognized file patterns --- encode-wrapper/cleanup.json | 18 ++++++++- encode-wrapper/encode_test_tasks_run.sh | 2 +- encode-wrapper/patterns.json | 54 ------------------------- encode-wrapper/trackoutput.py | 39 +++++++----------- 4 files changed, 31 insertions(+), 82 deletions(-) delete mode 100644 encode-wrapper/patterns.json diff --git a/encode-wrapper/cleanup.json b/encode-wrapper/cleanup.json index 05d2fc0..7ae3198 100644 --- a/encode-wrapper/cleanup.json +++ b/encode-wrapper/cleanup.json @@ -74,5 +74,19 @@ "*conservative_peak.bb", "*qc.html", "*qc.json" - ] -} \ No newline at end of file + ], + "extraneous": [ "glob-.*\\.list", + "hg38\\.blacklist\\.bed\\.gz", + "hg38\\.chrom\\.sizes", + "hg38_local\\.tsv", + "null", "rc", "script", "script\\.background", + "script\\.submit", + ".*java.so", + "stderr", + "stderr\\.background", + "stdout", + "stdout\\.background", + "tmp\\.tsv", + "write_tsv_.*\\.tmp", "cromwell_glob_control_file" + ] +} diff --git a/encode-wrapper/encode_test_tasks_run.sh b/encode-wrapper/encode_test_tasks_run.sh index 61692cc..f51a98c 100755 --- a/encode-wrapper/encode_test_tasks_run.sh +++ b/encode-wrapper/encode_test_tasks_run.sh @@ -10,7 +10,7 @@ chmod +x $H/testrun_tasks.sh testsOut=$H/test_tasks_results_"$tag" mkdir $testsOut || true cd $BASE/chip-seq-pipeline2/test/test_task -echo "__container__:$BASE,$BACKEND,$PWD" +echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do #for t in test_bam2ta; do diff --git a/encode-wrapper/patterns.json b/encode-wrapper/patterns.json deleted file mode 100644 index 08346c5..0000000 --- 
a/encode-wrapper/patterns.json +++ /dev/null @@ -1,54 +0,0 @@ -[ - "*.bb", "*.bw", "*.bigwig", "*.bigbed", "*narrowPeak.hammock.gz*", "*.badReads", "*fasta.tar", - "*.fastq.gz", "*.fastq", - "*merge*fastq*R*.fastq.gz", - "*.trim*bp.fastq.gz", - "*.bam", - "*.bai", - "*.flagstat.qc", - "*.nodup.bam", - "*.nodup.flagstat.qc", - "*.dup.qc", - "*.pbc.qc", - "*.tagAlign.gz", - "*.N.tagAlign.gz", - "*.tn5.tagAlign.gz", - "*.pr1.tagAlign.gz", - "*.pr2.tagAlign.gz", - "*.tagAlign.gz", - "*.jsd.qc", - "*.png", - "*ctl_for_rep*.tagAlign.gz", - "*.cc.plot.pdf", - "*.cc.plot.png", - "*.cc.qc", - "*.cc.fraglen.txt", - "*.narrowPeak.gz", - "*.bfilt.narrowPeak.gz", - "*.bfilt.narrowPeak.bb", - "*.pval.signal.bigwig", - "*.fc.signal.bigwig", - "*.frip.qc", - "*.regionPeak.gz", - "*.bfilt.regionPeak.gz", - "*.bfilt.regionPeak.bb", - "*.frip.qc", - "*.*Peak.gz", - "*.bfilt.*Peak.gz", - "*.bfilt.*Peak.bb", - "*.txt.png", - "*.txt.gz", - "*.log", "*.txt", - "*.frip.qc", - "*.*Peak.gz", - "*.bfilt.*Peak.gz", - "*.bfilt.*Peak.bb", - "*.frip.qc", - "*.reproducibility.qc", - "*optimal_peak.gz", - "*optimal_peak.bb", - "*conservative_peak.gz", - "*conservative_peak.bb", - "*qc.html", - "*qc.json" -] diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index d9e69a9..0c58cc6 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -1,9 +1,20 @@ from utilsm import * import os +import re import sys +def check_extraneous(patterns, flist): + regex = [re.compile(e) for e in patterns] + def matched(x): + for e in regex: + if e.findall(x): return True + return False + unexpected = [f for f in flist if not matched(os.path.basename(f))] + return unexpected + + def findfiles(base, pattern): cmd = "find {0} -iname '{1}'".format(base, pattern) print cmd @@ -33,31 +44,6 @@ def byino(fs): full_flist[sortedfiles[0]] = sortedfiles[1:] return (hashed2, full_flist) -def md5script(hashed): - def cmd(f): - if f.strip().endswith('bam'): - return 'echo "{1} 
$(./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) - else: - return 'echo "{1} $(md5sum {0})"'.format(f, os.path.basename(f)) - - return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] - -def trackoutput(base, i, filereport): - logerr('# looking in {0}\n'.format(base)) - bams = findfiles(base, '*.bam') - narrowpeaks = findfiles(base, '*narrow*gz') - (bamsbyino, bams_flist) = byino(bams) - (peaksbyino, peaks_flist) = byino(narrowpeaks) - - if not filereport: - print writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino)) - - qc = findfiles(base, 'qc.html') - print qc - - return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1] } - - def make_filereport(patterns, base): logerr('# looking in {0}\n'.format(base)) @@ -115,6 +101,9 @@ def main(args): print dumpf('./masterfiles.list', '\n'.join(keep) + '\n') print jdumpf('./unrecognized_files.json', extra) + unexpected = check_extraneous(config["extraneous"], extra) + print "unexpected", unexpected + #print 'size', sum(flistsize(keep).values()) From 01b541b7f082a8d2bec931b91d033d18ac990f0a Mon Sep 17 00:00:00 2001 From: sitag Date: Thu, 1 Aug 2019 12:33:28 -0700 Subject: [PATCH 09/45] cleaup updates --- encode-wrapper/chip.py | 2 +- encode-wrapper/cleanup.json | 2 ++ encode-wrapper/readme.md | 15 ++++++--------- encode-wrapper/trackoutput.py | 34 ++++++++++++++++++++++++++-------- initpipe | 10 ++++++++++ 5 files changed, 45 insertions(+), 18 deletions(-) create mode 100755 initpipe diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 7cb3821..c2bde7a 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -134,7 +134,7 @@ def write_testrun(config): logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(config['home']), infile.read().format(**config)) + '\n') logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity\n\nsingularity exec 
{additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} "${{1:-Local}}" ${{@:2}}\n\n'.format(**config))) - logerrn(dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $?\n\n'.format(**config))) + logerrn(dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**config))) return dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec {additional_binds} {container_image} {home_mnt}/piperunner.sh {home_mnt} $1 $BACKEND\n\n'.format(**config)) diff --git a/encode-wrapper/cleanup.json b/encode-wrapper/cleanup.json index 7ae3198..71341dc 100644 --- a/encode-wrapper/cleanup.json +++ b/encode-wrapper/cleanup.json @@ -1,5 +1,6 @@ { "delete": [ + "*.fq.gz", "*.fastq.gz", "*.fastq", "*fasta.tar", @@ -10,6 +11,7 @@ "*filt.bam.bai" ], "patterns": [ + "*.fq.gz", "*trim_50bp.bam.bai", "*trim_50bp.bam", "*filt.bam", diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 8428272..f1e7185 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -126,15 +126,12 @@ This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. } -See output of `./trackoutput.sh ` to see what files are to be copied over. This list will be in `masterfiles.list` - - - - - - - - +See output of `./trackoutput.sh ` to see what files are to be copied over. 
`trackoutput.sh` will write following lists of files: + ./delete.list # files okay to delete + ./masterfiles.list # files that will be kept + ./extraneous_cromwell.list # files that are likely extraneous cromwell files + ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved + ./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index 0c58cc6..14f1663 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -14,6 +14,12 @@ def matched(x): unexpected = [f for f in flist if not matched(os.path.basename(f))] return unexpected +def check_fileskept(keep): + for f in keep: + stats = os.stat(f) # this fill choke if there are issue with resolving hardlinking + print '#keeping... ', os.path.basename(f), 'size', stats.st_size , 'ino', stats.st_ino + + def findfiles(base, pattern): cmd = "find {0} -iname '{1}'".format(base, pattern) @@ -31,8 +37,15 @@ def flistsize(fs): def byino(fs): hashed = dict() + negino = -1 for e in fs: - ino = os.stat(e).st_ino + try: + ino = os.stat(e).st_ino + except OSError as err: + logerr('# WARN.. 
{0}\n'.format(str(err))) + ino = negino # use a new negative ino for each unresolved ino + negino = negino - 1 + if not ino in hashed: hashed[ino] = list() hashed[ino].append(e) hashed2 = dict() @@ -62,7 +75,7 @@ def main(args): arg = args[0] - config = jloadf('cleanup.json') + config = jloadf('./cleanup.json') patterns = config["patterns"] assert not "extra" in patterns output = make_filereport(patterns, arg) @@ -73,11 +86,12 @@ def main(args): patternfiles = list() + unresolvedlinks = list() for p in patterns: record = output - print p, sum(flistsize(record['flist'][p]).values())/10**9, 'GB approx' + #print p, sum(flistsize(record['flist'][p]).values())/10**9, 'GB approx' patternfiles.extend(record['flist'][p].keys() + [e for k in record['flist'][p] for e in record['flist'][p][k]] ) - + unresolvedlinks.extend([record['byino'][p][z] for z in record['byino'][p] if z < 0]) extra = [f for f in allfiles if not f in patternfiles] @@ -96,14 +110,18 @@ def main(args): keep.extend([e for e in record['flist'][k].keys() if not e in rmlist]) keep = sorted(list(set(keep))) + unexpected = check_extraneous(config["extraneous"], extra) print dumpf('./delete.list', '\n'.join(rmlist) + '\n') print dumpf('./masterfiles.list', '\n'.join(keep) + '\n') - print jdumpf('./unrecognized_files.json', extra) - - unexpected = check_extraneous(config["extraneous"], extra) - print "unexpected", unexpected + print dumpf('./extraneous_cromwell.list', '\n'.join(extra) + '\n') + print dumpf('./unresolvedfiles.list', '\n'.join(unresolvedlinks) + '\n') + print dumpf('./unexpectedfiles.list', '\n'.join(unexpected) + '\n') + + check_fileskept(keep) + print "unexpected files?", len(unexpected) > 0 + print "unresolved files?", len(unresolvedlinks) > 0 #print 'size', sum(flistsize(keep).values()) diff --git a/initpipe b/initpipe new file mode 100755 index 0000000..8bf536b --- /dev/null +++ b/initpipe @@ -0,0 +1,10 @@ +#!/bin/bash + +set -eufx -o pipefail + +cd encode-wrapper +chmod +x 
./get_encode_resources.sh +./get_encode_resources.sh &> get_encode_resources.log +apy chip.py -get &> chip_get.log +apy chip.py -pullimage -bindpwd &> chip_pull_image.log +#apy chip.py -pullimage -bindpwd $PWD/v2/ihec &> chip_pull_image.log From 9acfa57de8f09f32df62eb7b9348f82b51491701 Mon Sep 17 00:00:00 2001 From: sitag Date: Thu, 1 Aug 2019 12:56:00 -0700 Subject: [PATCH 10/45] trapping find (really should do it differently) --- encode-wrapper/trackoutput.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index 14f1663..b6541f0 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -25,12 +25,17 @@ def findfiles(base, pattern): cmd = "find {0} -iname '{1}'".format(base, pattern) print cmd p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - return [e.strip() for e in p.stdout.readlines()] + #assert p.returncode == 0 + found = [e.strip() for e in p.stdout.readlines()] + for e in found: assert len(e.split()) == 1 + return found def listfiles(base): p = subprocess.Popen("find {0} -type f".format(base), shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - return [e.strip() for e in p.stdout.readlines()] - + #assert p.returncode == 0 + found = [e.strip() for e in p.stdout.readlines()] + for e in found: assert len(e.split()) == 1 + return found def flistsize(fs): return {e : os.stat(e).st_size for e in fs} From 60482815371853f03cff2d925238449b987e5530 Mon Sep 17 00:00:00 2001 From: sitag Date: Thu, 15 Aug 2019 00:05:24 -0700 Subject: [PATCH 11/45] add rmsize tracking --- encode-wrapper/trackoutput.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index b6541f0..c6f42bf 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -89,7 +89,7 @@ def main(args): - + patternfiles = 
list() unresolvedlinks = list() for p in patterns: @@ -104,15 +104,26 @@ def main(args): print jdumpf('./file_shortreport.json', short_out) rmlist = list() + rmsize = 0 for ftype in config["delete"]: for k in record['flist'][ftype]: rmlist.extend(record['flist'][ftype][k]) rmlist.append(k) + try: + rmsize = rmsize + os.stat(k).st_size + except Exception as err: + print 'WARNING: __cannot_read_filesize__:', k, err - keep = list() + + keep, redundant = list(), list() for k in record['flist']: if not k in config['delete']: - keep.extend([e for e in record['flist'][k].keys() if not e in rmlist]) + keeping = [e for e in record['flist'][k].keys() if not e in rmlist] + keep.extend(keeping) + for z in keeping: + redundant.extend(record['flist'][k][z]) + + keep = sorted(list(set(keep))) unexpected = check_extraneous(config["extraneous"], extra) @@ -122,7 +133,8 @@ def main(args): print dumpf('./extraneous_cromwell.list', '\n'.join(extra) + '\n') print dumpf('./unresolvedfiles.list', '\n'.join(unresolvedlinks) + '\n') print dumpf('./unexpectedfiles.list', '\n'.join(unexpected) + '\n') - + print dumpf('./redundantlinks.list', '\n'.join(redundant) + '\n') + check_fileskept(keep) print "unexpected files?", len(unexpected) > 0 @@ -130,7 +142,7 @@ def main(args): #print 'size', sum(flistsize(keep).values()) - + print 'delete.list = ', rmsize if __name__ == '__main__': main(sys.argv[1:]) From a70dafe84870c1029bf2ee8364d4ca47ad60c477 Mon Sep 17 00:00:00 2001 From: sitag Date: Mon, 2 Sep 2019 21:32:59 -0400 Subject: [PATCH 12/45] add cleanup notes --- encode-wrapper/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index f1e7185..8ec85d4 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -134,4 +134,4 @@ See output of `./trackoutput.sh ` to see what f ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved 
./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files - +The expected workflow is to remove files from `delete.list` only (in case diskspace is an issue). And then symlink files from `masterfiles.list` in an empty directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. From 520beadf60ffe78ddba320d0782f39ba6cf80797 Mon Sep 17 00:00:00 2001 From: sitag Date: Mon, 2 Sep 2019 21:37:33 -0400 Subject: [PATCH 13/45] add organising output section --- encode-wrapper/readme.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 8ec85d4..f82a506 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -125,6 +125,7 @@ This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. "failures": 0 } +## Organizing ENCODE output See output of `./trackoutput.sh ` to see what files are to be copied over. `trackoutput.sh` will write following lists of files: @@ -134,4 +135,6 @@ See output of `./trackoutput.sh ` to see what f ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved ./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files -The expected workflow is to remove files from `delete.list` only (in case diskspace is an issue). And then symlink files from `masterfiles.list` in an empty directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. +The recommended workflow is to remove files from `delete.list` only (in case diskspace is an issue). And then symlink files from `masterfiles.list` in an empty directory.
So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. + +It's expected that `unresolvedfiles.list` and `unexpectedfiles.list` are empty. If they are not empty, the files listed there will need to be looked at. Please review files before deleting to ensure nothing useful is removed. From e3e1ee84d86775729628954609acffc30281224a Mon Sep 17 00:00:00 2001 From: sitag Date: Mon, 30 Sep 2019 18:41:18 -0700 Subject: [PATCH 14/45] enable passing output directories --- encode-wrapper/config.py | 50 +++++++++++++++++++++++++++++++++++ encode-wrapper/trackoutput.py | 43 +++++++++++++++++------------- 2 files changed, 74 insertions(+), 19 deletions(-) create mode 100644 encode-wrapper/config.py diff --git a/encode-wrapper/config.py b/encode-wrapper/config.py new file mode 100644 index 0000000..a80a1a0 --- /dev/null +++ b/encode-wrapper/config.py @@ -0,0 +1,50 @@ +import utilsm +import sys + +class Config: + def __init__(self, args): + keyargs = dict() + def argtype(x): + if x.strip()[0] != '-': return 'values' + else: + tokens = x.split(':') + t = len(tokens) + if t == 2: + if tokens[0] in keyargs: raise Exception('malformed arguments '+ str(args) ) + else: keyargs[tokens[0]] = tokens[1] + elif t == 1: return 'flags' + else: raise Exception('malformed arguments '+ str(x)) + + parsed = utilsm.by_keyvalue(args, k = lambda x: argtype(x), v = lambda x: x) + self.values, self.keys, self.flags = parsed.get('values', []), keyargs, parsed.get('flags', []) + + def __getitem__(self, k): + if k in self.keys: return self.keys[k] + elif k in self.flags: return True + else: + raise Exception('__MISSING__') + + def get_values(self): + return self.values + + def get(self, k, defaultvalue = None): + if not defaultvalue: return self.keys[k] + else: return self.keys.get(k, defaultvalue) + + def or_else(self, k, defaultValue): + return self.keys.get(k, 
defaultValue) + + def has(self, flag): + return flag in self.flags or flag in self.keys.keys() + + def option(self, field): + return self.keys[field] + + def __str__(self): + return '#config = -flags:{0} -keys:{1} values:{2}'.format(self.flags, self.keys, self.values) + + + + @staticmethod + def sys(): + return Config(sys.argv[1:]) diff --git a/encode-wrapper/trackoutput.py b/encode-wrapper/trackoutput.py index c6f42bf..d5487aa 100644 --- a/encode-wrapper/trackoutput.py +++ b/encode-wrapper/trackoutput.py @@ -2,7 +2,7 @@ import os import re import sys - +from config import Config def check_extraneous(patterns, flist): @@ -19,8 +19,6 @@ def check_fileskept(keep): stats = os.stat(f) # this fill choke if there are issue with resolving hardlinking print '#keeping... ', os.path.basename(f), 'size', stats.st_size , 'ino', stats.st_ino - - def findfiles(base, pattern): cmd = "find {0} -iname '{1}'".format(base, pattern) print cmd @@ -31,7 +29,7 @@ def findfiles(base, pattern): return found def listfiles(base): - p = subprocess.Popen("find {0} -type f".format(base), shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + p = subprocess.Popen("find {0} -type f".format(base), shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) #assert p.returncode == 0 found = [e.strip() for e in p.stdout.readlines()] for e in found: assert len(e.split()) == 1 @@ -74,20 +72,26 @@ def make_filereport(patterns, base): return {'byino' : fbyino, 'flist':flist} -def main(args): +def main(cfg): + args = cfg.get_values() + filereport = True - assert len(args) == 1, '__only_one_target_directory_at_a_time__' + assert len(args) == 1, '__only_pass_one_target_directory_at_a_time__' arg = args[0] + print '# analyzing:', arg + + - config = jloadf('./cleanup.json') + config = jloadf(cfg.or_else("-cleanup", './cleanup.json')) + outdir = cfg.option('-outdir') + patterns = config["patterns"] assert not "extra" in patterns output = make_filereport(patterns, arg) short_out = {k: 
sorted(output['flist'][k].keys()) for k in output['flist']} allfiles = listfiles(arg) - patternfiles = list() @@ -100,8 +104,8 @@ def main(args): extra = [f for f in allfiles if not f in patternfiles] - print jdumpf('./filereport.json', output) - print jdumpf('./file_shortreport.json', short_out) + print jdumpf(outdir + '/filereport.json', output) + print jdumpf(outdir + '/file_shortreport.json', short_out) rmlist = list() rmsize = 0 @@ -128,12 +132,12 @@ def main(args): unexpected = check_extraneous(config["extraneous"], extra) - print dumpf('./delete.list', '\n'.join(rmlist) + '\n') - print dumpf('./masterfiles.list', '\n'.join(keep) + '\n') - print dumpf('./extraneous_cromwell.list', '\n'.join(extra) + '\n') - print dumpf('./unresolvedfiles.list', '\n'.join(unresolvedlinks) + '\n') - print dumpf('./unexpectedfiles.list', '\n'.join(unexpected) + '\n') - print dumpf('./redundantlinks.list', '\n'.join(redundant) + '\n') + print dumpf(outdir + '/delete.list', '\n'.join(rmlist) + '\n') + print dumpf(outdir + '/masterfiles.list', '\n'.join(keep) + '\n') + print dumpf(outdir + '/extraneous_cromwell.list', '\n'.join(extra) + '\n') + print dumpf(outdir + '/unresolvedfiles.list', '\n'.join(unresolvedlinks) + '\n') + print dumpf(outdir + '/unexpectedfiles.list', '\n'.join(unexpected) + '\n') + print dumpf(outdir + '/redundantlinks.list', '\n'.join(redundant) + '\n') check_fileskept(keep) @@ -141,10 +145,11 @@ def main(args): print "unresolved files?", len(unresolvedlinks) > 0 #print 'size', sum(flistsize(keep).values()) - print 'delete.list = ', rmsize -if __name__ == '__main__': - main(sys.argv[1:]) + print '# analyzed:', arg + +if __name__ == '__main__': + main(Config.sys()) From 1df6a00df0730e9e2e1fd5039a4be0864a884f31 Mon Sep 17 00:00:00 2001 From: sitag Date: Mon, 30 Sep 2019 18:42:50 -0700 Subject: [PATCH 15/45] fix documentation --- encode-wrapper/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md 
b/encode-wrapper/readme.md index f82a506..82061ed 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -36,7 +36,7 @@ This will write: * trackoutput.sh -If you are running in `Local` mode using using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_0`, `$PWD/data_a` as `/mnt/ext_1` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and shift other bind points further along `ext_$i`'s. +If you are running in `Local` mode using using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_1`, `$PWD/data_a` as `/mnt/ext_2` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and other bind points further along `ext_$i`'s. For example, From ab778de4bb63e135ce4a37f8aafb7f0f2d940645 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Tue, 1 Oct 2019 13:27:01 -0400 Subject: [PATCH 16/45] Adding slurm singularity backend support Usage of Local or Slurm Singularity backend Cf. readme.md for more details. Usage of sambamba instead of singularity in IHEC test checking. Usage of Singularity image for IHEC test checking. 
--- .../backend_ihec_slurm_singularity.conf | 76 +++++++++++++++++++ encode-wrapper/chip.py | 61 +++++++++------ encode-wrapper/computemd5s.py | 4 +- encode-wrapper/encode_test_tasks_run.sh | 4 +- ...e_test_tasks_run_ihec_slurm_singularity.sh | 24 ++++++ encode-wrapper/headlessbam_md5 | 2 +- encode-wrapper/readme.md | 26 ++++--- encode-wrapper/testrun_tasks_template.sh | 26 +------ encode-wrapper/testrun_template.sh | 28 +++---- 9 files changed, 174 insertions(+), 77 deletions(-) create mode 100644 encode-wrapper/backend_ihec_slurm_singularity.conf create mode 100755 encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh diff --git a/encode-wrapper/backend_ihec_slurm_singularity.conf b/encode-wrapper/backend_ihec_slurm_singularity.conf new file mode 100644 index 0000000..7cd2d8e --- /dev/null +++ b/encode-wrapper/backend_ihec_slurm_singularity.conf @@ -0,0 +1,76 @@ +include required(classpath("application")) + +backend { + default = "Local" + providers { + + slurm_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String? slurm_partition + String? slurm_account + String? slurm_extra_param + String singularity_container + String? 
singularity_bindpath + """ + submit = """ + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (sbatch \ + --export=ALL \ + --mail-type=END,FAIL --mail-user=$JOB_MAIL \ + -A $RAP_ID \ + -J ${job_name} \ + -D ${cwd} \ + -o ${out} \ + -e ${err} \ + ${"-t " + time*60} \ + -n 1 \ + --ntasks-per-node=1 \ + ${"--cpus-per-task=" + cpu} \ + ${"--mem=" + memory_mb} \ + ${"-p " + slurm_partition} \ + ${"--account " + slurm_account} \ + ${"--gres gpu:" + gpu} \ + ${slurm_extra_param} \ + --wrap "chmod u+x ${script} && unset LD_LIBRARY_PATH && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") + """ + kill = "scancel ${job_id}" + check-alive = "squeue -j ${job_id}" + job-id-regex = "Submitted batch job (\\d+).*" + } + } + + Local { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + concurrent-job-limit = 10 + } + } + } +} + +services { + LoadController { + class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" + config { + # disable it (for login nodes on Stanford SCG, Sherlock) + control-frequency = 21474834 seconds + } + } +} + +system { + abort-jobs-on-terminate = true + graceful-server-shutdown = true +} + +call-caching { + enabled = false + invalidate-bad-cache-results = true +} diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index c2bde7a..f7d4a1b 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -127,16 +127,22 @@ def fix(fname, base): for f in mcf10a: print 'written:', fix(f, base) -def write_testrun(config): - with open('testrun_template.sh') as infile: - logerr('#written:' + dumpf('{0}/piperunner.sh'.format(config['home']), infile.read().format(**config)) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + 
dumpf('{0}/testrun_tasks.sh'.format(config['home']), infile.read().format(**config)) + '\n') - - logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity\n\nsingularity exec {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} "${{1:-Local}}" ${{@:2}}\n\n'.format(**config))) - logerrn(dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**config))) - return dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec {additional_binds} {container_image} {home_mnt}/piperunner.sh {home_mnt} $1 $BACKEND\n\n'.format(**config)) +def write_testrun(l_config): + for i in range(len(l_config)):# config in l_config: + if i == 0: + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity\n\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n'.format(**l_config[i]))) + logerrn(dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh {home_mnt} Local $BACKEND\n\n'.format(**l_config[i]))) + else: + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), 
infile.read().format(**l_config[i])) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**l_config[i])) def singularity_pull_image(home, config, binds, debug=debug_mode): @@ -176,17 +182,28 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): raise Exception('__couldNotCopy__:chip.wdl likey current directory is not bound in the container... ' + binds) logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n') logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n') - return { - 'additional_binds' : binds, - "container_image":image_path, - "home" : home, - "home_mnt": home_mnt, - "bind_opt": "${3:-}", - "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), - "wdl" : "{0}/v2/chip.wdl".format(home_mnt), - "backend" : "{0}/backend.conf".format(home_mnt) - } + return [{ + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend.conf".format(home_mnt) + }, + { + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend_ihec_slurm_singularity.conf".format(home_mnt) + }] def bindargs(args): @@ -233,8 +250,8 @@ def main(args): if '-pullimage' in args: params = [os.getcwd()] + [e for e 
in args if not e[0] == '-'] binds = bindargs(args) - container_config = singularity_pull_image(home, args, binds, debug = False) - container = write_testrun(container_config) + l_container_config = singularity_pull_image(home, args, binds, debug = False) + container = write_testrun(l_container_config) logerr('# container: {0}\n'.format(container)) if '-maketests' in args: diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py index a567407..d583d2e 100644 --- a/encode-wrapper/computemd5s.py +++ b/encode-wrapper/computemd5s.py @@ -32,9 +32,9 @@ def byino(fs): def md5script(hashed): def cmd(f): if f.strip().endswith('bam'): - return 'echo "{1} $(./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) else: - return 'echo "{1} $(md5sum {0})"'.format(f, os.path.basename(f)) + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif md5sum {0})"'.format(f, os.path.basename(f)) return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] diff --git a/encode-wrapper/encode_test_tasks_run.sh b/encode-wrapper/encode_test_tasks_run.sh index f51a98c..0f03891 100755 --- a/encode-wrapper/encode_test_tasks_run.sh +++ b/encode-wrapper/encode_test_tasks_run.sh @@ -3,12 +3,12 @@ BASE=$1 BACKEND=$2 -H=$BASE tag=${3:-""} +H=$BASE chmod +x $H/testrun_tasks.sh testsOut=$H/test_tasks_results_"$tag" -mkdir $testsOut || true +mkdir -p $testsOut || true cd $BASE/chip-seq-pipeline2/test/test_task echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" diff --git a/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh b/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh new file mode 100755 index 0000000..0a883e8 --- /dev/null +++ b/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh @@ -0,0 +1,24 @@ +#!/bin/bash + + 
+BASE=$1 +BACKEND=$2 +tag=${3:-""} +H=$BASE + +chmod +x $H/testrun_tasks_ihec_slurm_singularity.sh +testsOut=$H/test_tasks_results_"$tag" +mkdir -p $testsOut || true +cd $BASE/chip-seq-pipeline2/test/test_task +echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" + +for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do +#for t in test_bam2ta; do + echo "# started: $t $(date)" + $H/testrun_tasks_ihec_slurm_singularity.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND + echo "# end: $t $(date) $?" + echo "ok___________________" +done + + + diff --git a/encode-wrapper/headlessbam_md5 b/encode-wrapper/headlessbam_md5 index 2d4e62f..a10c488 100755 --- a/encode-wrapper/headlessbam_md5 +++ b/encode-wrapper/headlessbam_md5 @@ -2,5 +2,5 @@ -m="$(samtools view $1 | md5sum)" +m="$(sambamba view $1 | md5sum)" echo $m diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 82061ed..78ad44a 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -20,9 +20,9 @@ Run `chip.py -get` to get IHEC ChIP test data for MCF10A cell line. ## Pulling Singularity image and generating wrapper scripts -Check singularity version with `singularity --version` to make sure it's at least `2.5.2` . +Check singularity version with `singularity --version` to make sure it's at least `2.5.2`. -Then run `python chip.py -pullimage -bindpwd` . Bind pwd will mount the current directory (equivalent to arguments `-B $PWD`). Note that this means singularity must be a recent enough version to be able to bind to directories that do not exist on the image, since your `$PWD` may not exist on the image. Otherwise see `-pwd2ext0` option that binds $PWD to `/mnt/ext_0`. +Then run `python chip.py -pullimage -bindpwd`. `bindpwd` will mount the current directory (equivalent to arguments `-B $PWD`). 
Note that this means singularity must be a recent enough version to be able to bind to directories that do not exist on the image, since your `$PWD` may not exist on the image. Otherwise see `-pwd2ext0` option that binds $PWD to `/mnt/ext_0`. This will write: @@ -36,7 +36,7 @@ This will write: * trackoutput.sh -If you are running in `Local` mode using using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_1`, `$PWD/data_a` as `/mnt/ext_2` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and other bind points further along `ext_$i`'s. +If you are running in `Local` mode using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_0`, `$PWD/data_a` as `/mnt/ext_1` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and shift other bind points further along `ext_$i`'s. For example, @@ -48,17 +48,19 @@ will set up all binds so that after downloading the cemt0007 test data, you can without needing to do `chip.py -maketests` as later described. -This will also create the singularity image in `./images` . +This will also create the singularity image in `./images`. -Do `chmod +x ./*sh` +Do `chmod +x ./*sh`. -You can pass `-nobuild` if you hust want to regenerate the wrapper scripts without pulling the singularity image again. +You can pass `-nobuild` if you just want to regenerate the wrapper scripts without pulling the singularity image again. If you did not use `python ./chip.py -pullimage -bindpwd -pwd2ext0 $PWD/v2/ihec` then you will not be able to use `cemt0007_h3k*_mnt_ext_0.json` for tests, as the test data may not be mapped to `/ext/mnt_0`. See running tests below. ## Running tests -To run ENCODE test tasks, do `singularity_encode_test_tasks.sh Local try1`. 
The first argument is the config argument to cromwell (see ENCODE pipeline documentation). Only Local is currently supported. The second is suffix for test output directory. The output of tests will be written in `test_tasks_results_try1` . Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. +To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh Local try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1`. + +Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. python ./status_encode_tasks.py ./test_tasks_results_try1 # ok:./test_tasks_results_try1/test_spr.test_task_output.json @@ -87,11 +89,15 @@ Doing `python chip.py -maketests` will write ChIP test configurations (you also * ./v2/ihec/cemt0007_h3k27me3.json -IHEC tests can be run with: +IHEC tests on Local mode can be run with: + +`./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json` + +Or using SLURM with: -`./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json` +`./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` -The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. Currently the only local mode is supported for singularity. The ENCODE documentation discusses other modes. +The provided configuration files are for 75bp PET only. 
Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. To compute md5s of generated file, use `computemd5s.py `. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. diff --git a/encode-wrapper/testrun_tasks_template.sh b/encode-wrapper/testrun_tasks_template.sh index 22a97ba..a7279ec 100755 --- a/encode-wrapper/testrun_tasks_template.sh +++ b/encode-wrapper/testrun_tasks_template.sh @@ -1,36 +1,18 @@ #!/bin/bash -unset PYTHONPATH -unset R_LIBS_USER -which R -which python -which java - -echo "paths: $R_LIBS_USER $PYTHONPATH" -echo $PATH - +WDL=$1 +jobFile=$2 +RESULT=$3 +BACKEND=$4 CROMWELL_HOME="{home_mnt}" BACKEND_CONF="{backend}" WORKFLOW_OPT="{container}" -BACKEND="$4" - -WDL="$1" - PREFIX=$(basename $WDL .wdl) METADATA="$PREFIX".metadata.json # metadata -RESULT=$3 - - -jobFile=$2 java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $CROMWELL_HOME/cromwell-34.jar run $WDL -i $jobFile -o $WORKFLOW_OPT -m $METADATA echo "return:$?" 
cat $METADATA | python -c "import json,sys;obj=json.load(sys.stdin);print(obj['outputs']['"$PREFIX".compare_md5sum.json_str'])" > $RESULT cat $RESULT - - - - - diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh index 884db91..1339aa9 100755 --- a/encode-wrapper/testrun_template.sh +++ b/encode-wrapper/testrun_template.sh @@ -1,25 +1,17 @@ #!/bin/bash - -cd $1 - -echo $PWD - -unset PYTHONPATH -unset R_LIBS_USER -which R -which python -which java - -echo "paths: $R_LIBS_USER $PYTHONPATH" -echo $PATH - +jobFile=$1 +BACKEND=$2 +if [[ $# -eq 3 ]]; then + OUTDIR="-Dbackend.providers.$BACKEND.config.root=$3" +else + OUTDIR="" +fi + +CROMWELL_HOME="{home_mnt}" BACKEND_CONF="{backend}" WORKFLOW_OPT="{container}" -BACKEND=$3 #"{backend_default}" CHIP="{wdl}" -jobFile=$2 - -java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT +java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $OUTDIR cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT echo "return:$?" From 7e9acc1d8e2c7a9c2089bb34307565be1dea7889 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 2 Oct 2019 14:34:40 -0400 Subject: [PATCH 17/45] Singularity usage with cleanenv Modification of usage of singularity within cromwell: cleanenv and not only unset LD_LIBRARY_PATH. Adding some more info in readme. 
--- encode-wrapper/backend_ihec_slurm_singularity.conf | 2 +- encode-wrapper/readme.md | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/encode-wrapper/backend_ihec_slurm_singularity.conf b/encode-wrapper/backend_ihec_slurm_singularity.conf index 7cd2d8e..36b761e 100644 --- a/encode-wrapper/backend_ihec_slurm_singularity.conf +++ b/encode-wrapper/backend_ihec_slurm_singularity.conf @@ -38,7 +38,7 @@ backend { ${"--account " + slurm_account} \ ${"--gres gpu:" + gpu} \ ${slurm_extra_param} \ - --wrap "chmod u+x ${script} && unset LD_LIBRARY_PATH && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") + --wrap "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") """ kill = "scancel ${job_id}" check-alive = "squeue -j ${job_id}" diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 78ad44a..3d12a51 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -28,8 +28,12 @@ This will write: * piperunner.sh +* piperunner_ihec_slurm_singularity.sh + * testrun_tasks.sh +* testrun_tasks_ihec_slurm_singularity.sh + * singularity_encode_test_tasks.sh * singularity_wrapper.sh @@ -99,11 +103,11 @@ Or using SLURM with: The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. -To compute md5s of generated file, use `computemd5s.py `. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. 
+To compute md5s of generated file, use `computemd5s.py ` with `` being the output directory of previous step and `` being the suffix to add at file output basename `computemd5s_`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `$PWD/h3k4me3_out`. So do - python computemd5s.py $PWD/h3k4me3_out _test + python computemd5s.py $PWD/h3k4me3_out test chmod +x ./computemd5s_test ./computemd5s_test > log_h3k4me3 python status_cemt.py log_h3k4me3 expected_md5s_h3k4me3.json From 5529e7c500869145a11d62c2b14f03c697da69c0 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Thu, 3 Oct 2019 11:40:11 -0400 Subject: [PATCH 18/45] Readme update --- encode-wrapper/readme.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 3d12a51..88491d0 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -12,9 +12,9 @@ Documemtation on how to define configs for IHEC standard workflows: [IHEC standa ## Downloading test data -First run `./get_encode_resources.sh` to get encode test dataset and hg38 genome files. +First run `./get_encode_resources.sh` to get encode test dataset and hg38 genome files. -By default it will use git over http. If you want to use ssh, then pass `ssh` as first argument +By default it will use git over http. If you want to use ssh, then pass `ssh` as first argument. Run `chip.py -get` to get IHEC ChIP test data for MCF10A cell line. @@ -22,7 +22,7 @@ Run `chip.py -get` to get IHEC ChIP test data for MCF10A cell line. Check singularity version with `singularity --version` to make sure it's at least `2.5.2`. -Then run `python chip.py -pullimage -bindpwd`. 
`bindpwd` will mount the current directory (equivalent to arguments `-B $PWD`). Note that this means singularity must be a recent enough version to be able to bind to directories that do not exist on the image, since your `$PWD` may not exist on the image. Otherwise see `-pwd2ext0` option that binds $PWD to `/mnt/ext_0`. +Then run `python chip.py -pullimage -bindpwd`. `bindpwd` will mount the current directory (equivalent to arguments `-B $PWD`). Note that this means singularity must be a recent enough version to be able to bind to directories that do not exist on the image, since your `$PWD` may not exist on the image. Otherwise see `-pwd2ext0` option that binds $PWD to `/mnt/ext_0`. This will write: @@ -42,7 +42,7 @@ This will write: If you are running in `Local` mode using `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_0`, `$PWD/data_a` as `/mnt/ext_1` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` `/mnt/ext_0` and shift other bind points further along `ext_$i`'s. -For example, +For example, python ./chip.py -pullimage -bindpwd -pwd2ext0 $PWD/v2/ihec @@ -50,21 +50,21 @@ will set up all binds so that after downloading the cemt0007 test data, you can $ ./singularity_wrapper.sh cemt0007_h3k27me3_mnt_ext_0.json -without needing to do `chip.py -maketests` as later described. +without needing to do `chip.py -maketests` as later described. This will also create the singularity image in `./images`. Do `chmod +x ./*sh`. -You can pass `-nobuild` if you just want to regenerate the wrapper scripts without pulling the singularity image again. +You can pass `-nobuild` if you just want to regenerate the wrapper scripts without pulling the singularity image again. 
-If you did not use `python ./chip.py -pullimage -bindpwd -pwd2ext0 $PWD/v2/ihec` then you will not be able to use `cemt0007_h3k*_mnt_ext_0.json` for tests, as the test data may not be mapped to `/ext/mnt_0`. See running tests below. +If you did not use `python ./chip.py -pullimage -bindpwd -pwd2ext0 $PWD/v2/ihec` then you will not be able to use `cemt0007_h3k*_mnt_ext_0.json` for tests, as the test data may not be mapped to `/ext/mnt_0`. See running tests below. ## Running tests -To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh Local try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1`. +To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1`. -Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. +Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. python ./status_encode_tasks.py ./test_tasks_results_try1 # ok:./test_tasks_results_try1/test_spr.test_task_output.json @@ -87,7 +87,7 @@ Make sure all test pass, by looking through jsons generated. 
`./status_encode_ta "#ok": 14 } -Doing `python chip.py -maketests` will write ChIP test configurations (you also need to pass `-pwd2ext0` if you set `$PWD` to `/ext/mnt_0`) : +Doing `python chip.py -maketests` will write ChIP test configurations (you also need to pass `-pwd2ext0` if you set `$PWD` to `/ext/mnt_0`): * ./v2/ihec/cemt0007_h3k4me3.json @@ -101,20 +101,20 @@ Or using SLURM with: `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` -The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. +The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. -To compute md5s of generated file, use `computemd5s.py ` with `` being the output directory of previous step and `` being the suffix to add at file output basename `computemd5s_`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. +To compute md5s of generated file, use `computemd5s.py ` with `` being the output directory of previous step and `` being the suffix to add at file output basename `computemd5s_`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without teh bam header as that may contain full paths names. -As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `$PWD/h3k4me3_out`. So do +As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `$PWD/h3k4me3_out`. 
So do python computemd5s.py $PWD/h3k4me3_out test chmod +x ./computemd5s_test ./computemd5s_test > log_h3k4me3 python status_cemt.py log_h3k4me3 expected_md5s_h3k4me3.json -This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. +This will match md5s for cemt0007 H3K4me3 analysis. And similarly for H3K27me3. - $ python status_cemt.py computemd5s_0.out ./expected_md5s_h3k27me3.json + $ python status_cemt.py computemd5s_0.out ./expected_md5s_h3k27me3.json ok ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz 1c9554fe8b67e61fd7c69a1881ec2e3a ok conservative_peak.narrowPeak.hammock.gz b78724bb667cc7bbfece8a587c10c915 ok ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz defd886ab7923b952e04ee033a722fac @@ -145,6 +145,6 @@ See output of `./trackoutput.sh ` to see what f ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved ./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files -The recommended workflow if to remove files from `delete.list` only (in case diskspace is an issue). And then symlink files from `masterfiles.list` in an empty directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. +The recommended workflow if to remove files from `delete.list` only (in case diskspace is an issue). And then symlink files from `masterfiles.list` in an empty directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. -It's expected that `unresolvedfiles.list` and `unexpectedfiles.list` are empty. 
If they are not empty, the files listed there will need to be looked at. Please review files before deleting to ensure nothing useful is removed. +It's expected that `unresolvedfiles.list` and `unexpectedfiles.list` are empty. If they are not empty, the files listed there will need to be looked at. Please review files before deleting to ensure nothing useful is removed. From de56ad8a872505162d88765079bc8925ed5d9cef Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 11 Oct 2019 17:49:36 -0400 Subject: [PATCH 19/45] Fixing Local for IHEC test --- encode-wrapper/chip.py | 2 +- encode-wrapper/readme.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index f7d4a1b..de04663 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -135,7 +135,7 @@ def write_testrun(l_config): with open('testrun_tasks_template.sh') as infile: logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity\n\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n'.format(**l_config[i]))) - logerrn(dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh {home_mnt} Local $BACKEND\n\n'.format(**l_config[i]))) + logerrn(dumpf('./singularity_wrapper.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\nBACKEND="{backend_default}"\n\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND $2\n\n'.format(**l_config[i]))) else: with open('testrun_template.sh') as infile: logerr('#written:' + 
dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 88491d0..06730e0 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -95,7 +95,7 @@ Doing `python chip.py -maketests` will write ChIP test configurations (you also IHEC tests on Local mode can be run with: -`./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json` +`./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out` Or using SLURM with: From 7fabadad681ca19bb745f6a0977117570edcf64d Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 31 Jan 2020 12:12:40 -0500 Subject: [PATCH 20/45] Fixing sambamba issue + pbs support --- .../backend_ihec_pbs_singularity.conf | 65 +++ encode-wrapper/chip.py | 477 +++++++++--------- 2 files changed, 312 insertions(+), 230 deletions(-) create mode 100644 encode-wrapper/backend_ihec_pbs_singularity.conf diff --git a/encode-wrapper/backend_ihec_pbs_singularity.conf b/encode-wrapper/backend_ihec_pbs_singularity.conf new file mode 100644 index 0000000..047a39d --- /dev/null +++ b/encode-wrapper/backend_ihec_pbs_singularity.conf @@ -0,0 +1,65 @@ +include required(classpath("application")) + +backend { + default = "Local" + providers { + + pbs_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int time = 1 + Int memory_mb = 1024 + String singularity_container + String? 
singularity_bindpath + """ + submit = """ + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ + -N ${job_name} \ + -o ${out} \ + -e ${err} \ + -l nodes=1:ppn=${cpu} \ + -l mem=${memory_mb}MB \ + -l walltime=${time}:0:0 \ + ${if gpu>1 then "-lngpus=" + gpu else ""} \ + -V + ) + """ + kill = "qdel ${job_id}" + check-alive = "qstat -j ${job_id}" + job-id-regex = "(\\d+).*" + } + } + + Local { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + concurrent-job-limit = 10 + } + } + } +} + +services { + LoadController { + class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" + config { + # disable it (for login nodes on Stanford SCG, Sherlock) + control-frequency = 21474834 seconds + } + } +} + +system { + abort-jobs-on-terminate = true + graceful-server-shutdown = true +} + +call-caching { + enabled = false + invalidate-bad-cache-results = true +} diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index ed7e812..d044cef 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -14,261 +14,278 @@ debug_mode = False def base(): - return os.path.dirname(os.path.realpath(__file__)) + return os.path.dirname(os.path.realpath(__file__)) def wget(url, debug=debug_mode): - logerr('getting: {}\n'.format(url)) - if debug: - logerr(' ..debug: wget {0}\n'.format(url)) - dumpf(os.path.basename(url), 'test:{0}'.format(url)) - return - #p = subprocess.Popen('wget ' + url ,shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - #p = subprocess.Popen(['wget', url, '--directory-prefix', './test_data'] ,shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - #for line in p.stdout.readlines(): 
- # logerr(line) - p = subprocess.Popen('wget ' + url ,shell=True) - return p.wait() - - - + logerr('getting: {}\n'.format(url)) + if debug: + logerr(' ..debug: wget {0}\n'.format(url)) + dumpf(os.path.basename(url), 'test:{0}'.format(url)) + return + #p = subprocess.Popen('wget ' + url ,shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + #p = subprocess.Popen(['wget', url, '--directory-prefix', './test_data'] ,shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + #for line in p.stdout.readlines(): + # logerr(line) + p = subprocess.Popen('wget ' + url ,shell=True) + return p.wait() + + + def get_hg38_resources(home): - base = os.path.abspath(os.getcwd()) - mkdirs('hg38_resources/genome_hg38/bwa_index') - os.chdir('./hg38_resources') - for f in [ - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.blacklist.bed.gz', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.chrom.sizes', - ]: - wget(f) - movefile('hg38.blacklist.bed.gz', './genome_hg38/') - movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', './genome_hg38/') - movefile('hg38.chrom.sizes', './genome_hg38/') - movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', './genome_hg38/bwa_index/') - config = '\n'.join([ - "blacklist {0}/genome_hg38/hg38.blacklist.bed.gz", - "chrsz {0}/genome_hg38/hg38.chrom.sizes", - "gensz hs", - "bowtie2_idx_tar /dev/null", - "bwa_idx_tar {0}/genome_hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar", - "ref_fa {0}/genome_hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta", - ]).format(base) + '\n' - logerr( dumpf('./hg38_local.tsv', config) + '\n' ) - base_config = jdumpf('./base_config.json', { 'chip.genome_tsv' : os.path.abspath('./hg38_local.tsv'), 'base' : base }) - 
os.chdir(home) - return base_config + base = os.path.abspath(os.getcwd()) + mkdirs('hg38_resources/genome_hg38/bwa_index') + os.chdir('./hg38_resources') + for f in [ + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.blacklist.bed.gz', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.chrom.sizes', + ]: + wget(f) + movefile('hg38.blacklist.bed.gz', './genome_hg38/') + movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', './genome_hg38/') + movefile('hg38.chrom.sizes', './genome_hg38/') + movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', './genome_hg38/bwa_index/') + config = '\n'.join([ + "blacklist {0}/genome_hg38/hg38.blacklist.bed.gz", + "chrsz {0}/genome_hg38/hg38.chrom.sizes", + "gensz hs", + "bowtie2_idx_tar /dev/null", + "bwa_idx_tar {0}/genome_hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar", + "ref_fa {0}/genome_hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta", + ]).format(base) + '\n' + logerr(dumpf('./hg38_local.tsv', config) + '\n') + base_config = jdumpf('./base_config.json', {'chip.genome_tsv': os.path.abspath('./hg38_local.tsv'), 'base': base}) + os.chdir(home) + return base_config def existing_ref_config(configfile): - home = os.path.abspath(os.getcwd()) - config = jloadf(configfile) - hashed = { k : os.path.realpath(config[k]) if k in ['blacklist', 'chrsz', 'ref_fa', 'bwa_idx_tar'] else config[k] for k in config } - mkdirs('./hg38_resources') - os.chdir('./hg38_resources') - config = '\n'.join([e.strip() for e in '''blacklist {blacklist} - chrsz {chrsz} - gensz {gensz} - bowtie2_idx_tar {bowtie2_idx_tar} - bwa_idx_tar {bwa_idx_tar} - ref_fa {ref_fa}'''.format(**hashed).splitlines() ]) + '\n' - logerr( './hg38_resources/' + dumpf('./hg38_local.tsv', config) + 
'\n' ) - base_config = jdumpf('./base_config.json', { 'chip.genome_tsv' : os.path.abspath('./hg38_local.tsv'), 'base' : home }) - os.chdir(home) - return home + '/hg38_resources/base_config.json' + home = os.path.abspath(os.getcwd()) + config = jloadf(configfile) + hashed = {k: os.path.realpath(config[k]) if k in ['blacklist', 'chrsz', 'ref_fa', 'bwa_idx_tar'] else config[k] for k in config} + mkdirs('./hg38_resources') + os.chdir('./hg38_resources') + config = '\n'.join([e.strip() for e in '''blacklist {blacklist} + chrsz {chrsz} + gensz {gensz} + bowtie2_idx_tar {bowtie2_idx_tar} + bwa_idx_tar {bwa_idx_tar} + ref_fa {ref_fa}'''.format(**hashed).splitlines()]) + '\n' + logerr('./hg38_resources/' + dumpf('./hg38_local.tsv', config) + '\n') + base_config = jdumpf('./base_config.json', {'chip.genome_tsv': os.path.abspath('./hg38_local.tsv'), 'base': home}) + os.chdir(home) + return home + '/hg38_resources/base_config.json' def rm(target): - try: - shutil.rmtree(target) - except OSError as e: - logerr("# error: {0} / {1}".format(target, e.strerror)) + try: + shutil.rmtree(target) + except OSError as e: + logerr("# error: {0} / {1}".format(target, e.strerror)) def get_test_data(configfile, home): - config = jloadf(configfile) - os.chdir('./v2/ihec/test_data') - oks = dict() - for k in config['data']: - oks[k] = False - for url in config['data'][k]: - if wget(url) == 0: - oks[k] = True - break - else: - logerr('# failed downloading:' + url) - incomplete = glob.glob('./' + os.path.basename(url)) - if len(incomplete) > 0: - assert len(incomplete) == 1, incomplete - shutil.remove(incomplete[0]) - logerr('# removed failed download.. 
' + incomplete[0]) - os.chdir(home) - for k in oks: - assert oks[k], ['could not download all test data', k] + config = jloadf(configfile) + os.chdir('./v2/ihec/test_data') + oks = dict() + for k in config['data']: + oks[k] = False + for url in config['data'][k]: + if wget(url) == 0: + oks[k] = True + break + else: + logerr('# failed downloading:' + url) + incomplete = glob.glob('./' + os.path.basename(url)) + if len(incomplete) > 0: + assert len(incomplete) == 1, incomplete + shutil.remove(incomplete[0]) + logerr('# removed failed download.. ' + incomplete[0]) + os.chdir(home) + for k in oks: + assert oks[k], ['could not download all test data', k] def make_tests(args): - mcf10a = ['cemt0007_h3k4me3_template.json', 'cemt0007_h3k27me3_template.json'] - - if os.path.isfile('./hg38_resources/base_config.json'): - config = jloadf('./hg38_resources/base_config.json') - base = config['base'] - else: - base = '/mnt/ext_0' if '-pwd2ext0' in args else os.path.abspath(os.getcwd()) - - def fix(fname, base): - assert fname.endswith('_template.json') - out = './v2/ihec/{0}.json'.format(fname[0:-len('_template.json')]) - config = jloadf(fname) - return dumpf(out, jsonp(config).replace('{0}', base)) - - for f in mcf10a: - print2('written:', fix(f, base)) + mcf10a = ['cemt0007_h3k4me3_template.json', 'cemt0007_h3k27me3_template.json'] + + if os.path.isfile('./hg38_resources/base_config.json'): + config = jloadf('./hg38_resources/base_config.json') + base = config['base'] + else: + base = '/mnt/ext_0' if '-pwd2ext0' in args else os.path.abspath(os.getcwd()) + + def fix(fname, base): + assert fname.endswith('_template.json') + out = './v2/ihec/{0}.json'.format(fname[0:-len('_template.json')]) + config = jloadf(fname) + return dumpf(out, jsonp(config).replace('{0}', base)) + + for f in mcf10a: + print2('written:', fix(f, base)) def write_testrun(l_config): - for i in range(len(l_config)):# config in l_config: - if i == 0: - with open('testrun_template.sh') as infile: - 
logerr('#written:' + dumpf('{0}/piperunner.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - - encode_tests = [ - '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' - ] - logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**l_config[i]))) - mcf_tests = [ - '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND\n\n' - ] - logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**l_config[i]))) - else: - with open('testrun_template.sh') as infile: - logerr('#written:' + dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - - return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**l_config[i])) + for i in range(len(l_config)):# config in l_config: + if i == 0: + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + + encode_tests = [ + 
'#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' + ] + logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**l_config[i]))) + mcf_tests = [ + '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND\n\n' + ] + logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**l_config[i]))) + elif i == 1: + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + elif i == 2: + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner_ihec_pbs_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_pbs_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') + + return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**l_config[i])) def singularity_pull_image(home, config, binds, debug=debug_mode): - #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.2' - imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4' - image_version = imageurl.split(':')[-1].replace('.', '_') - os.chdir('./images') - if debug: - dumpf('./debug.img', 'test:{0}'.format('singularity')) - else: 
- cmd = 'singularity pull {0}'.format(imageurl) - logerr('# .. ' + cmd + '\n') - if not '-nobuild' in config: - shell(cmd, assert_ok = True) - - images = glob.glob('./*img') + glob.glob('./*.sif') - assert len(images) == 1, images - image_label = 'chip_seq_pipeline_{0}'.format(image_version) - image_ext = images[0].split('.')[-1] - image_name = '{0}.{1}'.format(image_label, image_ext) - logerr('# pulled image: {0}, moved: {1}\n'.format(images[0], image_name)) - os.rename(images[0], image_name) - image_path = os.path.abspath(image_name) - os.chdir(home) - home_mnt = "/mnt/ext_0" if '-pwd2ext0' in config else home - container_mnt = '{0}/v2/singularity_container.json'.format(home_mnt) - container = jdumpf('./v2/singularity_container.json', { - "default_runtime_attributes" : { - "singularity_container" : '{0}/images/{1}'.format(home_mnt, image_name) , - "singularity_instance_name": image_label - } - }) - - shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/v2/'.format(image_path, binds, home_mnt), assert_ok=True) - shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/'.format(image_path, binds, home_mnt), assert_ok=True) - if not os.path.exists('./chip.wdl') or not os.path.exists('./v2/chip.wdl'): - raise Exception('__could_not_copy__:chip.wdl likey current directory is not bound in the container... 
' + binds) - logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n') - logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n') - return [{ - 'additional_binds' : binds, - "container_image":image_path, - "home" : home, - "home_mnt": home_mnt, - "bind_opt": "${3:-}", - "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), - "wdl" : "{0}/v2/chip.wdl".format(home_mnt), - "backend" : "{0}/backend.conf".format(home_mnt) - }, - { - 'additional_binds' : binds, - "container_image":image_path, - "home" : home, - "home_mnt": home_mnt, - "bind_opt": "${3:-}", - "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), - "wdl" : "{0}/v2/chip.wdl".format(home_mnt), - "backend" : "{0}/backend_ihec_slurm_singularity.conf".format(home_mnt) - }] + #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.' + imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1' + image_version = imageurl.split(':')[-1].replace('.', '_') + os.chdir('./images') + if debug: + dumpf('./debug.img', 'test:{0}'.format('singularity')) + else: + cmd = 'singularity pull {0}'.format(imageurl) + logerr('# .. 
' + cmd + '\n') + if not '-nobuild' in config: + shell(cmd, assert_ok=True) + + images = glob.glob('./*img') + glob.glob('./*.sif') + assert len(images) == 1, images + image_label = 'chip_seq_pipeline_{0}'.format(image_version) + image_ext = images[0].split('.')[-1] + image_name = '{0}.{1}'.format(image_label, image_ext) + logerr('# pulled image: {0}, moved: {1}\n'.format(images[0], image_name)) + os.rename(images[0], image_name) + image_path = os.path.abspath(image_name) + os.chdir(home) + home_mnt = "/mnt/ext_0" if '-pwd2ext0' in config else home + container_mnt = '{0}/v2/singularity_container.json'.format(home_mnt) + container = jdumpf('./v2/singularity_container.json', { + "default_runtime_attributes" : { + "singularity_container" : '{0}/images/{1}'.format(home_mnt, image_name), + "singularity_instance_name": image_label + } + }) + + shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/v2/'.format(image_path, binds, home_mnt), assert_ok=True) + shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/'.format(image_path, binds, home_mnt), assert_ok=True) + if not os.path.exists('./chip.wdl') or not os.path.exists('./v2/chip.wdl'): + raise Exception('__could_not_copy__:chip.wdl likey current directory is not bound in the container... 
' + binds) + logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n') + logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n') + return [ + { + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend.conf".format(home_mnt) + }, + { + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend_ihec_slurm_singularity.conf".format(home_mnt) + }, + { + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend_ihec_pbs_singularity.conf".format(home_mnt) + }] def bindargs(args): - binds = '' - if not '-bindpwd' in args: - return binds - if '-bindpwd' in args: - params = [e for e in args if not e[0] == '-'] - if '-pwd2ext0'in args: - bindpwd = '-B {0}:/mnt/ext_0'.format(os.getcwd()) - offset = 1 - else: - bindpwd = '-B ' + os.getcwd() - offset = 1 - - if not params: - return bindpwd - else: - return bindpwd + ',' + ','.join([ '{1}:/mnt/ext_{0}'.format(i + offset, e) for i,e in enumerate(params)]) - return binds + binds = '' + if not '-bindpwd' in args: + return binds + if '-bindpwd' in args: + params = [e for e in args if not e[0] == '-'] + if '-pwd2ext0'in args: + bindpwd = '-B {0}:/mnt/ext_0'.format(os.getcwd()) + offset = 1 + else: + bindpwd = '-B ' + os.getcwd() + offset = 1 + + if not params: + return bindpwd + else: + return bindpwd + 
',' + ','.join([ '{1}:/mnt/ext_{0}'.format(i + offset, e) for i,e in enumerate(params)]) + return binds def main(args): - home = base() - logerr('# prefix {0}\n'.format(home)) - mkdirs('./hg38_resources') - mkdirs('./images') - mkdirs('./v2/ihec/test_data') - - if '-clean' in args: - for d in ['./v2', './images', './hg38_resources']: - logerr('# removing {0}\n'.format(d)) - rm(d) - logerr('rm -rf ./v2/ images/ hg38_resources/ \n') - - if '-getref' in args: - get_hg38_resources(home) - - if '-refconfig' in args: - logerr(existing_ref_config('./ref_config.json') + '\n') - - if '-get' in args: - get_test_data('./test_config.json', home) - - if '-pullimage' in args: - params = [os.getcwd()] + [e for e in args if not e[0] == '-'] - binds = bindargs(args) - l_container_config = singularity_pull_image(home, args, binds, debug = False) - container = write_testrun(l_container_config) - logerr('# container: {0}\n'.format(container)) - - if '-maketests' in args: - make_tests(args) - - - logerrn("__finished__") - - + home = base() + logerr('# prefix {0}\n'.format(home)) + mkdirs('./hg38_resources') + mkdirs('./images') + mkdirs('./v2/ihec/test_data') + + if '-clean' in args: + for d in ['./v2', './images', './hg38_resources']: + logerr('# removing {0}\n'.format(d)) + rm(d) + logerr('rm -rf ./v2/ images/ hg38_resources/ \n') + + if '-getref' in args: + get_hg38_resources(home) + + if '-refconfig' in args: + logerr(existing_ref_config('./ref_config.json') + '\n') + + if '-get' in args: + get_test_data('./test_config.json', home) + + if '-pullimage' in args: + params = [os.getcwd()] + [e for e in args if not e[0] == '-'] + binds = bindargs(args) + l_container_config = singularity_pull_image(home, args, binds, debug=False) + container = write_testrun(l_container_config) + logerr('# container: {0}\n'.format(container)) + + if '-maketests' in args: + make_tests(args) + + + logerrn("__finished__") + + if __name__ == '__main__': - main(sys.argv[1:]) + main(sys.argv[1:]) From 
0f34feace90606ea72e4a5634704a5277f43a536 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 31 Jan 2020 15:44:42 -0500 Subject: [PATCH 21/45] PBS support fix --- ...ode_test_tasks_run_ihec_pbs_singularity.sh | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh diff --git a/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh b/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh new file mode 100755 index 0000000..6eee15d --- /dev/null +++ b/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh @@ -0,0 +1,24 @@ +#!/bin/bash + + +BASE=$1 +BACKEND=$2 +tag=${3:-""} +H=$BASE + +chmod +x $H/testrun_tasks_ihec_pbs_singularity.sh +testsOut=$H/test_tasks_results_"$tag" +mkdir -p $testsOut || true +cd $BASE/chip-seq-pipeline2/test/test_task +echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" + +for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do +#for t in test_bam2ta; do + echo "# started: $t $(date)" + $H/testrun_tasks_ihec_pbs_singularity.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND + echo "# end: $t $(date) $?" + echo "ok___________________" +done + + + From 82d44c581dfc0e346f7d4f17e95289148af7acc6 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 31 Jan 2020 15:50:44 -0500 Subject: [PATCH 22/45] PBS support readme modification --- encode-wrapper/readme.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 7f46688..20ee2ec 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -20,7 +20,7 @@ Run `python chip.py -get` to get IHEC ChIP test data for MCF10A cell line. 
## Running on cluster -For running on cluster with a slurm etc see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) +For running on cluster with a SLURM or PBS etc see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) ## Memory requirements @@ -48,10 +48,14 @@ This command will write: * piperunner_ihec_slurm_singularity.sh +* piperunner_ihec_pbs_singularity.sh + * testrun_tasks.sh * testrun_tasks_ihec_slurm_singularity.sh +* testrun_tasks_ihec_pbs_singularity.sh + * singularity_encode_test_tasks.sh * singularity_wrapper.sh @@ -74,7 +78,7 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ ### ENCODE tests -To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1`. +To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1` and for PBS do `./encode_test_tasks_run_ihec_pbs_singularity.sh pbs_singularity try1`. You will need atleast 10G of memory for running the encode tasks. 
@@ -113,10 +117,12 @@ IHEC tests on Local mode can be run with: `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json` -You can also use SLURM with; please see [cluster](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) section. It's recommended that `singularity_runner.sh` is used instead for simplicity. +You can also use SLURM or PBS with; please see [cluster](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) section. It's recommended that `singularity_runner.sh` is used instead for simplicity. `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` +`./piperunner_ihec_pbs_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json pbs_singularity h3k4me3_out` and `./piperunner_ihec_pbs_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json pbs_singularity h3k27me3_out` + The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. For these tests, the running time can be 24 hours depending on hardware. From d2a34aab45412b25d8155980af73ad8c568ef7c0 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Mon, 3 Feb 2020 10:16:05 -0500 Subject: [PATCH 23/45] Update docker image --- encode-wrapper/chip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index d044cef..3a751ef 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -161,7 +161,7 @@ def write_testrun(l_config): def singularity_pull_image(home, config, binds, debug=debug_mode): #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.' 
- imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1' + imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1-rev1' image_version = imageurl.split(':')[-1].replace('.', '_') os.chdir('./images') if debug: From 077a00bc0a38330a0da300d182fe1d14e4132078 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Thu, 13 Feb 2020 16:53:58 -0500 Subject: [PATCH 24/45] Path to cromwell fix inside template for piperunner --- encode-wrapper/testrun_template.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh index 1339aa9..4d379f9 100755 --- a/encode-wrapper/testrun_template.sh +++ b/encode-wrapper/testrun_template.sh @@ -13,5 +13,5 @@ BACKEND_CONF="{backend}" WORKFLOW_OPT="{container}" CHIP="{wdl}" -java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $OUTDIR cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT +java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $OUTDIR $CROMWELL_HOME/cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT echo "return:$?" From f2d0509c8ccb9e9a0634545b5a1bcfe8abf54a7b Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Thu, 13 Feb 2020 18:18:46 -0500 Subject: [PATCH 25/45] Adding cleanenv at singularity calling --- encode-wrapper/backend.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 874607c..7bcfe5a 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -96,7 +96,7 @@ backend { String? singularity_bindpath """ submit = """ - ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script} & echo $! 
&& disown) + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script} & echo $! && disown) """ job-id-regex = "(\\d+)" check-alive = "ps -ef | grep -v grep | grep ${job_id}" From 0eddb1f0de0ae5a7b06d751ceaef649fa43888d2 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Thu, 24 Sep 2020 15:42:21 -0400 Subject: [PATCH 26/45] Test --- encode-wrapper/backend.conf | 107 +++--------------- encode-wrapper/chip.py | 97 +++++----------- .../compute_canada_beluga_ressources.json | 37 ++++++ encode-wrapper/encode_test_tasks_run.sh | 4 - ...ode_test_tasks_run_ihec_pbs_singularity.sh | 24 ---- ...e_test_tasks_run_ihec_slurm_singularity.sh | 24 ---- encode-wrapper/readme.md | 4 +- encode-wrapper/utilsm.pyc | Bin 0 -> 4102 bytes 8 files changed, 85 insertions(+), 212 deletions(-) create mode 100644 encode-wrapper/compute_canada_beluga_ressources.json delete mode 100755 encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh delete mode 100755 encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh create mode 100644 encode-wrapper/utilsm.pyc diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 7bcfe5a..1cef6cf 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -23,6 +23,8 @@ backend { submit = """ ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (sbatch \ --export=ALL \ + --mail-type=END,FAIL --mail-user=$JOB_MAIL \ + -A $RAP_ID \ -J ${job_name} \ -D ${cwd} \ -o ${out} \ @@ -36,7 +38,7 @@ backend { ${"--account " + slurm_account} \ ${"--gres gpu:" + gpu} \ ${slurm_extra_param} \ - --wrap "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head 
-n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") + --wrap "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") """ kill = "scancel ${job_id}" check-alive = "squeue -j ${job_id}" @@ -44,46 +46,35 @@ backend { } } - sge_singularity { + pbs_singularity { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { - script-epilogue = "sleep 30 && sync" + script-epilogue = "sleep 30" concurrent-job-limit = 50 runtime-attributes = """ - String sge_pe = "shm" Int cpu = 1 Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? sge_extra_param + Int time = 1 + Int memory_mb = 1024 String singularity_container String? singularity_bindpath """ submit = """ - ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ - -S /bin/sh \ - -terse \ - -b n \ + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ -N ${job_name} \ - -wd ${cwd} \ -o ${out} \ -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \ - ${"-l h_vmem=" + memory_mb/cpu + "m"} \ - ${"-l s_vmem=" + memory_mb/cpu + "m"} \ - ${"-l h_rt=" + time*3600} \ - ${"-l s_rt=" + time*3600} \ - ${"-q " + sge_queue} \ - ${"-l 
gpu=" + gpu} \ - ${sge_extra_param} \ - -V) + -l nodes=1:ppn=${cpu} \ + -l mem=${memory_mb}MB \ + -l walltime=${time}:0:0 \ + ${if gpu>1 then "-lngpus=" + gpu else ""} \ + -V + ) """ kill = "qdel ${job_id}" check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" + job-id-regex = "(\\d+).*" } - } singularity { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" @@ -111,46 +102,6 @@ backend { } } - sge { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - String sge_pe = "shm" - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? sge_extra_param - """ - submit = """ - qsub \ - -S /bin/sh \ - -terse \ - -b n \ - -N ${job_name} \ - -wd ${cwd} \ - -o ${out} \ - -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \ - ${"-l h_vmem=" + memory_mb/cpu + "m"} \ - ${"-l s_vmem=" + memory_mb/cpu + "m"} \ - ${"-l h_rt=" + time*3600} \ - ${"-l s_rt=" + time*3600} \ - ${"-q " + sge_queue} \ - ${"-l gpu=" + gpu} \ - ${sge_extra_param} \ - -V \ - ${script} - """ - kill = "qdel ${job_id}" - check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" - } - } - slurm { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { @@ -188,34 +139,6 @@ backend { job-id-regex = "Submitted batch job (\\d+).*" } } - - google { - actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" - config { - # Google project - project = "your-project-name" - - # Base bucket for workflow executions - root = "gs://your-bucket-name" - - concurrent-job-limit = 1000 - genomics-api-queries-per-100-seconds = 1000 - maximum-polling-interval = 600 - - genomics { - auth = "application-default" - compute-service-account = "default" - endpoint-url = "https://genomics.googleapis.com/" - restrict-metadata-access = false - } - - filesystems 
{ - gcs { - auth = "application-default" - } - } - } - } } } diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 20fb230..e35411a 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -18,15 +18,15 @@ def base(): def wget(url, debug=debug_mode): - logerr('getting: {}\n'.format(url)) - if debug: - logerr(' ..debug: wget {0}\n'.format(url)) - dumpf(os.path.basename(url), 'test:{0}'.format(url)) - return - p = subprocess.Popen('wget ' + url ,shell=True) - return p.wait() - - + logerr('getting: {}\n'.format(url)) + if debug: + logerr(' ..debug: wget {0}\n'.format(url)) + dumpf(os.path.basename(url), 'test:{0}'.format(url)) + return + p = subprocess.Popen('wget ' + url ,shell=True) + return p.wait() + + def get_hg38_resources(home): base = os.path.abspath(os.getcwd()) mkdirs('hg38_resources/genome_hg38/bwa_index') @@ -122,36 +122,24 @@ def fix(fname, base): for f in mcf10a: print2('written:', fix(f, base)) -def write_testrun(l_config): - for i in range(len(l_config)):# config in l_config: - if i == 0: - with open('testrun_template.sh') as infile: - logerr('#written:' + dumpf('{0}/piperunner.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - - encode_tests = [ - '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' - ] - logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**l_config[i]))) - mcf_tests = [ - '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND\n\n' - ] - logerrn(dumpf('./singularity_wrapper.sh', 
'\n'.join(mcf_tests).format(**l_config[i]))) - elif i == 1: - with open('testrun_template.sh') as infile: - logerr('#written:' + dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - elif i == 2: - with open('testrun_template.sh') as infile: - logerr('#written:' + dumpf('{0}/piperunner_ihec_pbs_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - with open('testrun_tasks_template.sh') as infile: - logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_pbs_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n') - - return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**l_config[i])) +def write_testrun(config): + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner.sh'.format(config['home']), infile.read().format(**config)) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(config['home']), infile.read().format(**config)) + '\n') + + encode_tests = [ + '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' + ] + logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) + mcf_tests = [ + '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND\n\n' + ] + 
logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) + + return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**config)) def singularity_pull_image(home, config, binds, debug=debug_mode): @@ -191,8 +179,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): raise Exception('__could_not_copy__:chip.wdl likey current directory is not bound in the container... ' + binds) logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n') logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n') - return [ - { + return { 'additional_binds' : binds, "container_image":image_path, "home" : home, @@ -202,29 +189,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): "container" : container_mnt, #os.path.abspath(container), "wdl" : "{0}/v2/chip.wdl".format(home_mnt), "backend" : "{0}/backend.conf".format(home_mnt) - }, - { - 'additional_binds' : binds, - "container_image":image_path, - "home" : home, - "home_mnt": home_mnt, - "bind_opt": "${3:-}", - "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), - "wdl" : "{0}/v2/chip.wdl".format(home_mnt), - "backend" : "{0}/backend_ihec_slurm_singularity.conf".format(home_mnt) - }, - { - 'additional_binds' : binds, - "container_image":image_path, - "home" : home, - "home_mnt": home_mnt, - "bind_opt": "${3:-}", - "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), - "wdl" : "{0}/v2/chip.wdl".format(home_mnt), - "backend" : "{0}/backend_ihec_pbs_singularity.conf".format(home_mnt) - }] + } def bindargs(args): @@ -272,8 +237,8 @@ def main(args): if '-pullimage' in args: params = [os.getcwd()] + [e for e in args if not e[0] == '-'] binds = bindargs(args) - l_container_config = singularity_pull_image(home, args, binds, debug=False) - container = 
write_testrun(l_container_config) + container_config = singularity_pull_image(home, args, binds, debug=False) + container = write_testrun(container_config) logerr('# container: {0}\n'.format(container)) if '-maketests' in args: diff --git a/encode-wrapper/compute_canada_beluga_ressources.json b/encode-wrapper/compute_canada_beluga_ressources.json new file mode 100644 index 0000000..5a55ba3 --- /dev/null +++ b/encode-wrapper/compute_canada_beluga_ressources.json @@ -0,0 +1,37 @@ +/*Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(chip.align_cpu x NUMBER_OF_REPLICATES, chip.call_peak_cpu x 2 x NUMBER_OF_REPLICATES) because align and call_peak (especially for spp) are bottlenecking tasks of the pipeline. Use this total number of cores if you manually qsub or sbatch your job (using local mode of Caper).*/ + +{ + "chip.bwa_cpu": 15, + "chip.bwa_mem_mb": 70500,/*mem per cpu 4700 so put "chip.align_cpu" x 4700 here */ + "chip.bwa_time_hr": 24, + + "chip.filter_cpu": 10, + "chip.filter_mem_mb": 47000,/*mem per cpu 4700 so put "chip.filter_cpu" x 4700 here */ + "chip.filter_time_hr": 12, + + "chip.bam2ta_cpu": 8, + "chip.bam2ta_mem_mb": 37600,/*mem per cpu 4700 so put "chip.bam2ta_cpu" x 4700 here */ + "chip.bam2ta_time_hr": 6, + + "chip.spr_mem_mb": 18800, + + "chip.fingerprint_cpu": 6, + "chip.fingerprint_mem_mb": 28200,/*mem per cpu 4700 so put "chip.fingerprint_cpu" x 4700 here */ + "chip.fingerprint_time_hr": 24, + + "chip.xcor_cpu": 6, + "chip.xcor_mem_mb": 28200,/*mem per cpu 4700 so put "chip.xcor_cpu" x 4700 here */ + "chip.xcor_time_hr": 24, + + "chip.macs2_mem_mb": 18800, + "chip.macs2_time_hr": 24, + + "chip.spp_cpu": 2, + "chip.spp_mem_mb": 16000,/*mem per cpu 4700 so put "chip.call_peak_cpu" x 4700 here */ + "chip.spp_time_hr": 24, + + /*There are special parameters to control maximum Java heap memory (e.g. java -Xmx4G) for Picard tools. They are strings including size units. 
Such string will be directly appended to Java's parameter -Xmx. If these parameters are not defined then pipeline uses 90% of each task's memory (e.g. chip.filter_mem_mb).*/ + "chip.filter_picard_java_heap": "chip.filter_mem_mb", + "chip.align_trimmomatic_java_heap": "chip.align_mem_mb", + "chip.gc_bias_picard_java_heap": "10G" +} \ No newline at end of file diff --git a/encode-wrapper/encode_test_tasks_run.sh b/encode-wrapper/encode_test_tasks_run.sh index 0f03891..17c1447 100755 --- a/encode-wrapper/encode_test_tasks_run.sh +++ b/encode-wrapper/encode_test_tasks_run.sh @@ -13,12 +13,8 @@ cd $BASE/chip-seq-pipeline2/test/test_task echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do -#for t in test_bam2ta; do echo "# started: $t $(date)" $H/testrun_tasks.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND echo "# end: $t $(date) $?" 
echo "ok___________________" done - - - diff --git a/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh b/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh deleted file mode 100755 index 6eee15d..0000000 --- a/encode-wrapper/encode_test_tasks_run_ihec_pbs_singularity.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - - -BASE=$1 -BACKEND=$2 -tag=${3:-""} -H=$BASE - -chmod +x $H/testrun_tasks_ihec_pbs_singularity.sh -testsOut=$H/test_tasks_results_"$tag" -mkdir -p $testsOut || true -cd $BASE/chip-seq-pipeline2/test/test_task -echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" - -for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do -#for t in test_bam2ta; do - echo "# started: $t $(date)" - $H/testrun_tasks_ihec_pbs_singularity.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND - echo "# end: $t $(date) $?" 
- echo "ok___________________" -done - - - diff --git a/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh b/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh deleted file mode 100755 index 0a883e8..0000000 --- a/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - - -BASE=$1 -BACKEND=$2 -tag=${3:-""} -H=$BASE - -chmod +x $H/testrun_tasks_ihec_slurm_singularity.sh -testsOut=$H/test_tasks_results_"$tag" -mkdir -p $testsOut || true -cd $BASE/chip-seq-pipeline2/test/test_task -echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" - -for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do -#for t in test_bam2ta; do - echo "# started: $t $(date)" - $H/testrun_tasks_ihec_slurm_singularity.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND - echo "# end: $t $(date) $?" - echo "ok___________________" -done - - - diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 0e75777..04b83f7 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -90,9 +90,9 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ ### ENCODE tests -To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1` and for PBS do `./encode_test_tasks_run_ihec_pbs_singularity.sh pbs_singularity try1`. +To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). 
The output of tests will be written in `test_tasks_results_try1`. -You will need atleast 10G of memory for running the encode tasks. +You will need atleast 10G of memory for running the encode test tasks. Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. diff --git a/encode-wrapper/utilsm.pyc b/encode-wrapper/utilsm.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36714a2ae48f63d95cb7e8e66ad909ffed447436 GIT binary patch literal 4102 zcmc&%U2hvj6uo0RPV72O(oX^fN-I%=0JlZFlnSUQ6oC|A14?DJvbHE| zn!J!cBK7C-LwMo|&bb@g3grn2*Y>XOeC*6UXYSWl|5$GRdH$PTBqyH={=X0-?so_l zKO?Eg&`8&aVY?z}MYnfZPi)l;p92>X9HnekDY*U4^V$Z0sBDST%s>Ewzmv#O*v1hgOyx0{LE{I)K z;Ue|lY_9=Z@)tgB1IeGoZB!(g@Ab!7^ix6OKl%17+gv zBJ{~#+zYcXJ#dNZMLWr;6K7Fw;}7=>%!`Z8*e9tQel$Apb!?z0ZZ{w~DlR4>23{Qf zk~r2O0LQK&6r8TdQQ}vXV^i5|Q}GS23*?3G4dMf*3n!U{iV26?Rjxp3*X5ZfayuRZ z6T5yY#&JmjHyzFsKNX=yqh$ya_+)|lF5*)79SG$y0{d}AUc4?x6*;Vs5KwW5PW_oJ zEArz*aJ1RxA{(B7_GjSp5eVU)`$gj8pb4oQLRoP8~QLgxw_1C2qjUV z;l-L9A#@lLa}4JwNr10R!XY13!_X*F#Gy@-EOw>AsD;q@lI&GDg=*5jhW#DQRLsf@ z?$jmT#lQ?M`~iUha3`_hWyCzp(iOOdhQTxs?F@178v5ty3e;nlI`nTqI*payK#lci z--H0HI7{F|K-{e|E1X0?Jyag&hT{XzZXRiRD2?|dFF0@GqR<$id7_^e!_aF6&=5Fp z)M>sAeLjG?lMFz-{|PfK1elu&I=F&XK)Mub@D`+3gOXvSM42&{OvFN}6DRwV!31)e(M8}gw>IwM8EHmZU_MO+{_+h2`jI;rfOlnRUz zZYNGt$2@R7ewz3@&u$%E1#<~Hq)mN2LyQ|g85MaHJEv*$2PV*9iKEncKTLdZ6@pT$ zCD}cPYKhg@S4EK*Guw-XR%1)3aQ`{lT<{Jw{uq~IDim|kw9F+_$A3hd zFsOJDBnB4AB1!^fqiw`=& z)H@?I0Iu}Hr&#L0 zvOHOpEeua`GC5U(Y!Z1D(!_aXk8cr~ExnW$73v7-E8S{OXYGXWaT{FZI)pTc^$%>L zF|PuTb?1``{C1h?2lrnHt1xN>lgH!c_OwJBh^q%NhAO2gRKVd`QH1Hny>e!?4Fea4JR t|Nr|RZSU?dx8pSaOwFH#%<2g*%1)(G!*`*v&|2iZQEk*(OO1M?{x@r|kf;Cv literal 0 HcmV?d00001 From 1a0c331ec830220c0de1726d0e881c8408b3f915 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 25 Sep 2020 11:04:39 -0400 Subject: [PATCH 27/45] test --- encode-wrapper/chip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/chip.py 
b/encode-wrapper/chip.py index e35411a..00d6cbc 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -186,7 +186,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): "home_mnt": home_mnt, "bind_opt": "${3:-}", "backend_default" : "${2:-Local}", - "container" : container_mnt, #os.path.abspath(container), + "container" : os.path.abspath(container), #container_mnt, "wdl" : "{0}/v2/chip.wdl".format(home_mnt), "backend" : "{0}/backend.conf".format(home_mnt) } From ed2dc1d648292f75e2ff8610c78621bb2a1944f2 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 30 Sep 2020 17:12:42 -0400 Subject: [PATCH 28/45] Test --- encode-wrapper/chip.py | 2 +- encode-wrapper/utilsm.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 00d6cbc..e35411a 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -186,7 +186,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): "home_mnt": home_mnt, "bind_opt": "${3:-}", "backend_default" : "${2:-Local}", - "container" : os.path.abspath(container), #container_mnt, + "container" : container_mnt, #os.path.abspath(container), "wdl" : "{0}/v2/chip.wdl".format(home_mnt), "backend" : "{0}/backend.conf".format(home_mnt) } diff --git a/encode-wrapper/utilsm.py b/encode-wrapper/utilsm.py index de4675a..d7208a7 100644 --- a/encode-wrapper/utilsm.py +++ b/encode-wrapper/utilsm.py @@ -77,7 +77,3 @@ def by_keyvalue(alist, k, v): hashed[ke] = list() hashed[ke].append(ve) return hashed - - - - From 3b68f5c9773c696588be0900a5eff17c0b68b67a Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Tue, 6 Oct 2020 15:52:51 -0400 Subject: [PATCH 29/45] Test --- encode-wrapper/backend.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 1cef6cf..90d0d6c 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -75,6 +75,7 @@ backend { 
check-alive = "qstat -j ${job_id}" job-id-regex = "(\\d+).*" } + } singularity { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" From 0357c05c93e2fec6dc0217230742a7e6f7c76d30 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Tue, 6 Oct 2020 17:15:06 -0400 Subject: [PATCH 30/45] Test --- encode-wrapper/backend.conf | 10 ---------- encode-wrapper/chip.py | 4 +++- encode-wrapper/status_encode_tasks.py | 2 ++ 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 90d0d6c..d6d9122 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -162,13 +162,3 @@ call-caching { enabled = false invalidate-bad-cache-results = true } - -google { - application-name = "cromwell" - auths = [ - { - name = "application-default" - scheme = "application_default" - } - ] -} diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index e35411a..45cb696 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import subprocess import sys @@ -143,7 +145,7 @@ def write_testrun(config): def singularity_pull_image(home, config, binds, debug=debug_mode): - #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.' 
+ #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.2' imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1-rev1' image_version = imageurl.split(':')[-1].replace('.', '_') os.chdir('./images') diff --git a/encode-wrapper/status_encode_tasks.py b/encode-wrapper/status_encode_tasks.py index 681ae94..379d6cc 100644 --- a/encode-wrapper/status_encode_tasks.py +++ b/encode-wrapper/status_encode_tasks.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + from utilsm import * import sys import glob From a735a2e0ef36f2bfec707d5c55ef86485a1d4ee6 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Tue, 6 Oct 2020 18:31:25 -0400 Subject: [PATCH 31/45] Test --- encode-wrapper/chip.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 45cb696..38064f2 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -132,12 +132,12 @@ def write_testrun(config): encode_tests = [ '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{1:-test}}\n\n' ] logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) mcf_tests = [ '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND\n\n' + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $2 $BACKEND\n\n' ] logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) @@ -187,7 +187,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): "home" : home, "home_mnt": home_mnt, "bind_opt": "${3:-}", - 
"backend_default" : "${2:-Local}", + "backend_default" : "${3:-Local}", "container" : container_mnt, #os.path.abspath(container), "wdl" : "{0}/v2/chip.wdl".format(home_mnt), "backend" : "{0}/backend.conf".format(home_mnt) From 7e52f38f21b0529b8e87ed018a8ed6dfa42a13b0 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 7 Oct 2020 11:11:42 -0400 Subject: [PATCH 32/45] Test --- encode-wrapper/chip.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 38064f2..d870f5c 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -132,12 +132,12 @@ def write_testrun(config): encode_tests = [ '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{1:-test}}\n\n' + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' ] logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) mcf_tests = [ '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $2 $BACKEND\n\n' + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $BACKEND $2\n\n' ] logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) @@ -187,7 +187,7 @@ def singularity_pull_image(home, config, binds, debug=debug_mode): "home" : home, "home_mnt": home_mnt, "bind_opt": "${3:-}", - "backend_default" : "${3:-Local}", + "backend_default" : "${2:-Local}", "container" : container_mnt, #os.path.abspath(container), "wdl" : "{0}/v2/chip.wdl".format(home_mnt), "backend" : "{0}/backend.conf".format(home_mnt) From 6de370e1eb7470359b75f19bbb4b91db78bc6e45 Mon Sep 17 00:00:00 2001 From: Paul 
Stretenowich Date: Wed, 7 Oct 2020 11:47:43 -0400 Subject: [PATCH 33/45] Changing the usage and updating the doc --- encode-wrapper/backend.conf | 40 ------------------------------ encode-wrapper/chip.py | 4 +-- encode-wrapper/readme.md | 6 +++++ encode-wrapper/testrun_template.sh | 4 +-- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index d6d9122..0f3f944 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -103,46 +103,6 @@ backend { } } - slurm { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? slurm_partition - String? slurm_account - String? slurm_extra_param - """ - submit = """ - sbatch \ - --export=ALL \ - -J ${job_name} \ - -D ${cwd} \ - -o ${out} \ - -e ${err} \ - ${"-t " + time*60} \ - -n 1 \ - --ntasks-per-node=1 \ - ${"--cpus-per-task=" + cpu} \ - ${"--mem=" + memory_mb} \ - ${"-p " + slurm_partition} \ - ${"--account " + slurm_account} \ - ${"--gres gpu:" + gpu} \ - ${slurm_extra_param} \ - --wrap "/bin/bash ${script}" - """ - kill = "scancel ${job_id}" - check-alive = "squeue -j ${job_id}" - job-id-regex = "Submitted batch job (\\d+).*" - } - } - } -} - services { LoadController { class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index d870f5c..594c223 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -136,8 +136,8 @@ def write_testrun(config): ] logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) mcf_tests = [ - '#!/bin/bash', 'echo "home:$PWD"', "which singularity", 'BACKEND="{backend_default}"', - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 
$BACKEND $2\n\n' + '#!/bin/bash', 'echo "home:$PWD"', "which singularity", + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $2 Local\n\n' ] logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 04b83f7..467f206 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -88,6 +88,12 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ ## Running tests +/!\ There ar two scripts for running the pipeline for two specific use cases: +- `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. +- `piperunner.sh` for either a Local use or a HPC use. Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. + +To use custom ressources you can add to your input.json file specific sections. For Compute Canada Beluga users the file `compute_canada_beluga_ressources.json` is already defined; you can refer to this one for other HPCs. + ### ENCODE tests To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. 
diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh index 4d379f9..b5f8196 100755 --- a/encode-wrapper/testrun_template.sh +++ b/encode-wrapper/testrun_template.sh @@ -1,12 +1,12 @@ #!/bin/bash jobFile=$1 -BACKEND=$2 -if [[ $# -eq 3 ]]; then +if [[ $# -eq 2 ]]; then OUTDIR="-Dbackend.providers.$BACKEND.config.root=$3" else OUTDIR="" fi +BACKEND=$3 CROMWELL_HOME="{home_mnt}" BACKEND_CONF="{backend}" From ee6c61034cc0f69258fc704aed9bf7f1d88c6003 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 7 Oct 2020 11:50:34 -0400 Subject: [PATCH 34/45] Doc update --- encode-wrapper/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 467f206..c4c8e96 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -90,7 +90,7 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ /!\ There ar two scripts for running the pipeline for two specific use cases: - `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. -- `piperunner.sh` for either a Local use or a HPC use. Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. +- `piperunner.sh` for either a Local use or a HPC use. 
However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded (for Compute Canada users you can do `module load singularity/3.5 mugqic/java/openjdk-jdk1.8.0_72 mugqic/python/3.7.3`) Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. To use custom ressources you can add to your input.json file specific sections. For Compute Canada Beluga users the file `compute_canada_beluga_ressources.json` is already defined; you can refer to this one for other HPCs. From e5fea8719a27faecde8bc9597fd31654a29869a5 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 7 Oct 2020 11:53:10 -0400 Subject: [PATCH 35/45] Doc update --- encode-wrapper/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index c4c8e96..88950e7 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -88,7 +88,7 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ ## Running tests -/!\ There ar two scripts for running the pipeline for two specific use cases: +/!\ There are two scripts for running the pipeline for two specific use cases: - `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. - `piperunner.sh` for either a Local use or a HPC use. 
However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded (for Compute Canada users you can do `module load singularity/3.5 mugqic/java/openjdk-jdk1.8.0_72 mugqic/python/3.7.3`) Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. From 8e4a668dfcc91e748d288b69527a94ee45df24ae Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 7 Oct 2020 12:31:54 -0400 Subject: [PATCH 36/45] Fixing backend.conf --- encode-wrapper/backend.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 0f3f944..ecba615 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -102,6 +102,8 @@ backend { concurrent-job-limit = 10 } } + } +} services { LoadController { From 6f7b2475be7578fd2941f1fd3627a4a428e3bf17 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Thu, 8 Oct 2020 16:46:27 -0400 Subject: [PATCH 37/45] Changing back --- encode-wrapper/chip.py | 2 +- encode-wrapper/readme.md | 6 ++---- encode-wrapper/testrun_template.sh | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index 594c223..79384fb 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -137,7 +137,7 @@ def write_testrun(config): logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) mcf_tests = [ '#!/bin/bash', 'echo "home:$PWD"', "which singularity", - '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 $2 Local\n\n' + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 Local $2\n\n' ] logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) diff --git a/encode-wrapper/readme.md 
b/encode-wrapper/readme.md index 88950e7..e292873 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -90,7 +90,7 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ /!\ There are two scripts for running the pipeline for two specific use cases: - `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. -- `piperunner.sh` for either a Local use or a HPC use. However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded (for Compute Canada users you can do `module load singularity/3.5 mugqic/java/openjdk-jdk1.8.0_72 mugqic/python/3.7.3`) Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. +- `piperunner.sh` for either a Local use or a HPC use. However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded. For Compute Canada users you can add `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc` and then do `module load singularity/3.5 mugqic/java/openjdk-jdk1.8.0_72 mugqic/python/3.7.3`. Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. To use custom ressources you can add to your input.json file specific sections. For Compute Canada Beluga users the file `compute_canada_beluga_ressources.json` is already defined; you can refer to this one for other HPCs. 
@@ -137,9 +137,7 @@ IHEC tests on Local mode can be run with: You can also use SLURM or PBS with the pipeline; please see [cluster](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) section. It's recommended that `singularity_runner.sh` is used instead for simplicity. -`./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` - -`./piperunner_ihec_pbs_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json pbs_singularity h3k4me3_out` and `./piperunner_ihec_pbs_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json pbs_singularity h3k27me3_out` +`./piperunner.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` or replacing slurm_singularity by pbs_singularity for pbs HPCs. The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. 
diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh index b5f8196..4d379f9 100755 --- a/encode-wrapper/testrun_template.sh +++ b/encode-wrapper/testrun_template.sh @@ -1,12 +1,12 @@ #!/bin/bash jobFile=$1 -if [[ $# -eq 2 ]]; then +BACKEND=$2 +if [[ $# -eq 3 ]]; then OUTDIR="-Dbackend.providers.$BACKEND.config.root=$3" else OUTDIR="" fi -BACKEND=$3 CROMWELL_HOME="{home_mnt}" BACKEND_CONF="{backend}" From 8e6a18c0a72e31ba6a6b91fa6b7e96c4da8d3237 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 3 Mar 2021 16:22:23 -0500 Subject: [PATCH 38/45] Changing Compute Canada behaviour --- .../compute_canada_beluga_ressources.json | 37 ------------------ .../computecanada_encode_test_tasks.sh | 11 ++++++ encode-wrapper/computecanada_ressources.json | 38 +++++++++++++++++++ encode-wrapper/computecanada_wrapper.sh | 21 ++++++++++ encode-wrapper/encode_test_tasks_run.sh | 2 +- encode-wrapper/readme.md | 26 ++++++++++--- encode-wrapper/testrun_template.sh | 2 +- 7 files changed, 93 insertions(+), 44 deletions(-) delete mode 100644 encode-wrapper/compute_canada_beluga_ressources.json create mode 100644 encode-wrapper/computecanada_encode_test_tasks.sh create mode 100644 encode-wrapper/computecanada_ressources.json create mode 100644 encode-wrapper/computecanada_wrapper.sh diff --git a/encode-wrapper/compute_canada_beluga_ressources.json b/encode-wrapper/compute_canada_beluga_ressources.json deleted file mode 100644 index 5a55ba3..0000000 --- a/encode-wrapper/compute_canada_beluga_ressources.json +++ /dev/null @@ -1,37 +0,0 @@ -/*Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(chip.align_cpu x NUMBER_OF_REPLICATES, chip.call_peak_cpu x 2 x NUMBER_OF_REPLICATES) because align and call_peak (especially for spp) are bottlenecking tasks of the pipeline. 
Use this total number of cores if you manually qsub or sbatch your job (using local mode of Caper).*/ - -{ - "chip.bwa_cpu": 15, - "chip.bwa_mem_mb": 70500,/*mem per cpu 4700 so put "chip.align_cpu" x 4700 here */ - "chip.bwa_time_hr": 24, - - "chip.filter_cpu": 10, - "chip.filter_mem_mb": 47000,/*mem per cpu 4700 so put "chip.filter_cpu" x 4700 here */ - "chip.filter_time_hr": 12, - - "chip.bam2ta_cpu": 8, - "chip.bam2ta_mem_mb": 37600,/*mem per cpu 4700 so put "chip.bam2ta_cpu" x 4700 here */ - "chip.bam2ta_time_hr": 6, - - "chip.spr_mem_mb": 18800, - - "chip.fingerprint_cpu": 6, - "chip.fingerprint_mem_mb": 28200,/*mem per cpu 4700 so put "chip.fingerprint_cpu" x 4700 here */ - "chip.fingerprint_time_hr": 24, - - "chip.xcor_cpu": 6, - "chip.xcor_mem_mb": 28200,/*mem per cpu 4700 so put "chip.xcor_cpu" x 4700 here */ - "chip.xcor_time_hr": 24, - - "chip.macs2_mem_mb": 18800, - "chip.macs2_time_hr": 24, - - "chip.spp_cpu": 2, - "chip.spp_mem_mb": 16000,/*mem per cpu 4700 so put "chip.call_peak_cpu" x 4700 here */ - "chip.spp_time_hr": 24, - - /*There are special parameters to control maximum Java heap memory (e.g. java -Xmx4G) for Picard tools. They are strings including size units. Such string will be directly appended to Java's parameter -Xmx. If these parameters are not defined then pipeline uses 90% of each task's memory (e.g. 
chip.filter_mem_mb).*/ - "chip.filter_picard_java_heap": "chip.filter_mem_mb", - "chip.align_trimmomatic_java_heap": "chip.align_mem_mb", - "chip.gc_bias_picard_java_heap": "10G" -} \ No newline at end of file diff --git a/encode-wrapper/computecanada_encode_test_tasks.sh b/encode-wrapper/computecanada_encode_test_tasks.sh new file mode 100644 index 0000000..ae11736 --- /dev/null +++ b/encode-wrapper/computecanada_encode_test_tasks.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +OUTPUTDIR=$1 + +JOB_OUTPUT=encode_test.$OUTPUTDIR.log +JOB_NAME=encode_test.$OUTPUTDIR +cat /dev/null > $JOB_OUTPUT +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash ./singularity_encode_test_tasks.sh $1" +current_JOBID=$(echo "#!/bin/bash +$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=01:00:00 --mem-per-cpu=4700M -n 5 -N 1 | grep "[0-9]" | cut -d\ -f4) +echo $current_JOBID submitted... \ No newline at end of file diff --git a/encode-wrapper/computecanada_ressources.json b/encode-wrapper/computecanada_ressources.json new file mode 100644 index 0000000..d83affb --- /dev/null +++ b/encode-wrapper/computecanada_ressources.json @@ -0,0 +1,38 @@ +{ + "_COMMENT.General": "Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(chip.align_cpu x NUMBER_OF_REPLICATES, chip.call_peak_cpu x 2 x NUMBER_OF_REPLICATES) because align and call_peak (especially for spp) are bottlenecking tasks of the pipeline. 
Use this total number of cores if you manually qsub or sbatch your job (using local mode of Caper).", + "_COMMENT.General.Beluga": "The amount of mem per cpu is 4700M in Beluga so you can put \"chip.whatever_cpu\" x 4700 for any \"chip.whatever_mem_mb\".", + + "chip.bwa_cpu": 15, + "chip.bwa_mem_mb": 70500, + "chip.bwa_time_hr": 24, + + "chip.filter_cpu": 10, + "chip.filter_mem_mb": 47000, + "chip.filter_time_hr": 12, + + "chip.bam2ta_cpu": 8, + "chip.bam2ta_mem_mb": 37600, + "chip.bam2ta_time_hr": 6, + + "chip.spr_mem_mb": 18800, + + "chip.fingerprint_cpu": 6, + "chip.fingerprint_mem_mb": 28200, + "chip.fingerprint_time_hr": 24, + + "chip.xcor_cpu": 6, + "chip.xcor_mem_mb": 28200, + "chip.xcor_time_hr": 24, + + "chip.macs2_mem_mb": 18800, + "chip.macs2_time_hr": 24, + + "chip.spp_cpu": 2, + "chip.spp_mem_mb": 16000, + "chip.spp_time_hr": 24, + + "_COMMENT.Java.general": "There are special parameters to control maximum Java heap memory (e.g. java -Xmx4G) for Picard tools. They are strings including size units. Such string will be directly appended to Java's parameter -Xmx. If these parameters are not defined then pipeline uses 90% of each task's memory (e.g. 
chip.filter_mem_mb).", + "chip.filter_picard_java_heap": "chip.filter_mem_mb", + "chip.align_trimmomatic_java_heap": "chip.align_mem_mb", + "chip.gc_bias_picard_java_heap": "10G" +} \ No newline at end of file diff --git a/encode-wrapper/computecanada_wrapper.sh b/encode-wrapper/computecanada_wrapper.sh new file mode 100644 index 0000000..af8e721 --- /dev/null +++ b/encode-wrapper/computecanada_wrapper.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +PIPERUNNER=$1 +jobFile=$2 +BACKEND=$3 +if [[ $# -eq 4 ]]; then + OUTDIR=$4 +else + OUTDIR="" +fi + +filename=$(basename -- $2) +filename="${filename%.*}" + +JOB_OUTPUT=chipseq.$filename.log +JOB_NAME=chipseq.$filename +cat /dev/null > $JOB_OUTPUT +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2 $3 $4" +current_JOBID=$(echo "#!/bin/bash +$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=48:00:00 --mem-per-cpu=4700M -n 20 -N 1 | grep "[0-9]" | cut -d\ -f4) +echo $current_JOBID submitted... \ No newline at end of file diff --git a/encode-wrapper/encode_test_tasks_run.sh b/encode-wrapper/encode_test_tasks_run.sh index 17c1447..e8e3b46 100755 --- a/encode-wrapper/encode_test_tasks_run.sh +++ b/encode-wrapper/encode_test_tasks_run.sh @@ -12,7 +12,7 @@ mkdir -p $testsOut || true cd $BASE/chip-seq-pipeline2/test/test_task echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" -for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do +for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do echo "# started: $t $(date)" $H/testrun_tasks.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND echo "# end: $t $(date) $?" 
diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index e292873..cf5b99b 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -18,9 +18,9 @@ By default it will use git over http. If you want to use ssh, then pass `ssh` as Run `python chip.py -get` to get IHEC ChIP test data for MCF10A cell line. -## Running on cluster +## Running on cluster (Compute Canada included) -For running on cluster with a SLURM or PBS etc see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) +For running on cluster with scheduler like SLURM or PBS, and Compute Canada details see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1). ## Memory requirements @@ -90,9 +90,11 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ /!\ There are two scripts for running the pipeline for two specific use cases: - `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. -- `piperunner.sh` for either a Local use or a HPC use. However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded. For Compute Canada users you can add `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc` and then do `module load singularity/3.5 mugqic/java/openjdk-jdk1.8.0_72 mugqic/python/3.7.3`. Usage: `./piperunner.sh input.json backend output_dir` with backend being either Local, singularity, slurm_singularity or pbs_singularity and the output_dir behave the same as for `singularity_wrapper.sh`. + +- `computecanada_wrapper.sh` for Compute Canada users. 
However, you need to make sure that python3, java and Singularity (version 3 or above) can be loaded, by adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` to your `.bashrc`. Usage: `./computecanada_wrapper.sh path/to/piperunner.sh input.json Local output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. -To use custom ressources you can add to your input.json file specific sections. For Compute Canada Beluga users the file `compute_canada_beluga_ressources.json` is already defined; you can refer to this one for other HPCs. +To use custom resources, you can add specific sections to your input.json file. For Compute Canada users the file `computecanada_ressources.json` is already defined; you can refer to this one for other HPCs. +To merge the ressources.json and the input.json: `jq -s '.[0] * .[1]' input.json ressources.json > output_merged.json` ### ENCODE tests @@ -139,6 +141,8 @@ You can also use SLURM or PBS with the pipeline; please see [cluster](https://gi `./piperunner.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` or replacing slurm_singularity by pbs_singularity for pbs HPCs. +For Compute Canada users: `./computecanada_wrapper.sh piperunner.sh cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./computecanada_wrapper.sh piperunner.sh cemt0007_h3k27me3.json slurm_singularity h3k27me3_out`. + The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. For these tests, the running time can be 24 hours depending on hardware. 
@@ -194,5 +198,17 @@ The recommended workflow is to consider removing files from `delete.list` only ( ## Running on cluster -While the slurm_backend as defined by the encode pipeline will/should work; however, it's recommended that to run analysis on the cluster using slurm (or alternatives) just submit the a shell script containing the `./singularity_wrapper.sh $config` command. This means the entire job will run inside the container on one node on the cluster (i.e. the job will run in Local mode on the node it's submitted to). Using `slurm_singularity` backends (see [ENCODE documentation](https://encode-dcc.github.io/wdl-pipelines/install.html)) will mean cromwell will run on the head node (or where ever the job was launched from), and it will manage farming out each individual task to the cluster, with each task run in its own instance of singularity. +While the slurm_backend as defined by the encode pipeline will/should work; however, it's recommended to run analysis on the cluster using slurm (or alternatives) just submit a shell script containing the `./singularity_wrapper.sh $config` command. This means the entire job will run inside the container on one node on the cluster (i.e. the job will run in Local mode on the node it's submitted to). Using `slurm_singularity` backends (see [ENCODE documentation](https://encode-dcc.github.io/wdl-pipelines/install.html)) will mean cromwell will run on the head node (or where ever the job was launched from), and it will manage farming out each individual task to the cluster, with each task run in its own instance of singularity. + +### Compute Canada + +If you are a Compute Canada user you can customize ressources for different steps by using the file compute_canada_ressources.json. 
+To merge the ressources.json and the input.json: `jq -s '.[0] * .[1]' input.json compute_canada_beluga_ressources.json > output_merged.json` + +To setup the pipeline you need to do the following: +- Adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. +- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh piperunner.sh input.json Local output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 cpu and 4700M of RAM per cpu (half a full node on Beluga), it can be customized to fit the user needs. + +To do ENCODE testing run: `./computecanada_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results. +To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh piperunner.sh ./v2/ihec/cemt0007_h3k4me3.json Local h3k4me3_out` and `./computecanada_wrapper.sh piperunner.sh ./v2/ihec/cemt0007_h3k27me3.json Local h3k27me3_out`, then follow the standard procedure for checking results. 
\ No newline at end of file diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh index 4d379f9..619fe41 100755 --- a/encode-wrapper/testrun_template.sh +++ b/encode-wrapper/testrun_template.sh @@ -2,7 +2,7 @@ jobFile=$1 BACKEND=$2 -if [[ $# -eq 3 ]]; then +if [[ $# -eq 3 ]]; then OUTDIR="-Dbackend.providers.$BACKEND.config.root=$3" else OUTDIR="" From 1a9a13fc78bc0eea39936ef8619138e84ffb0a73 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Fri, 9 Apr 2021 16:29:20 -0400 Subject: [PATCH 39/45] Debug --- .../computecanada_encode_test_tasks.sh | 4 ++-- encode-wrapper/computecanada_wrapper.sh | 2 +- encode-wrapper/readme.md | 20 ++++++------------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/encode-wrapper/computecanada_encode_test_tasks.sh b/encode-wrapper/computecanada_encode_test_tasks.sh index ae11736..98aa97e 100644 --- a/encode-wrapper/computecanada_encode_test_tasks.sh +++ b/encode-wrapper/computecanada_encode_test_tasks.sh @@ -5,7 +5,7 @@ OUTPUTDIR=$1 JOB_OUTPUT=encode_test.$OUTPUTDIR.log JOB_NAME=encode_test.$OUTPUTDIR cat /dev/null > $JOB_OUTPUT -cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash ./singularity_encode_test_tasks.sh $1" +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2" current_JOBID=$(echo "#!/bin/bash -$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=01:00:00 --mem-per-cpu=4700M -n 5 -N 1 | grep "[0-9]" | cut -d\ -f4) +$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=02:00:00 --mem-per-cpu=4700M -n 10 -N 1 | grep "[0-9]" | cut -d\ -f4) echo $current_JOBID submitted... 
\ No newline at end of file diff --git a/encode-wrapper/computecanada_wrapper.sh b/encode-wrapper/computecanada_wrapper.sh index af8e721..8b7b4a6 100644 --- a/encode-wrapper/computecanada_wrapper.sh +++ b/encode-wrapper/computecanada_wrapper.sh @@ -15,7 +15,7 @@ filename="${filename%.*}" JOB_OUTPUT=chipseq.$filename.log JOB_NAME=chipseq.$filename cat /dev/null > $JOB_OUTPUT -cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2 $3 $4" +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2 $3" current_JOBID=$(echo "#!/bin/bash $cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=48:00:00 --mem-per-cpu=4700M -n 20 -N 1 | grep "[0-9]" | cut -d\ -f4) echo $current_JOBID submitted... \ No newline at end of file diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index cf5b99b..1765c8f 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -46,16 +46,8 @@ This command will write: * piperunner.sh -* piperunner_ihec_slurm_singularity.sh - -* piperunner_ihec_pbs_singularity.sh - * testrun_tasks.sh -* testrun_tasks_ihec_slurm_singularity.sh - -* testrun_tasks_ihec_pbs_singularity.sh - * singularity_encode_test_tasks.sh * singularity_wrapper.sh @@ -137,11 +129,11 @@ IHEC tests on Local mode can be run with: `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json` -You can also use SLURM or PBS with the pipeline; please see [cluster](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) section. It's recommended that `singularity_runner.sh` is used instead for simplicity. + -For Compute Canada users: `./slurm_wrapper.sh piperunner.sh cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./slurm_wrapper.sh piperunner.sh cemt0007_h3k27me3.json slurm_singularity h3k27me3_out`. 
+For Compute Canada users: `./slurm_wrapper.sh piperunner.sh cemt0007_h3k4me3.json Local h3k4me3_out` and `./slurm_wrapper.sh piperunner.sh cemt0007_h3k27me3.json Local h3k27me3_out`. The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. @@ -207,8 +199,8 @@ To merge the ressources.json and the input.json: `jq -s '.[0] * .[1]' input.json To setup the pipeline you need to do the following: - Adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. -- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh piperunner.sh input.json Local output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 cpu and 4700M of RAM per cpu (half a full node on Beluga), it can be customized to fit the user needs. +- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh singularity_wrapper.sh input.json output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 cpu and 4700M of RAM per cpu (half a full node on Beluga), it can be customized to fit the user needs. -To do ENCODE testing run: `./computecanada_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results. +To do ENCODE testing run: `./computecanada_encode_test_tasks.sh singularity_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results. 
-To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh piperunner.sh ./v2/ihec/cemt0007_h3k4me3.json Local h3k4me3_out` and `./computecanada_wrapper.sh piperunner.sh ./v2/ihec/cemt0007_h3k27me3.json Local h3k27me3_out`, then follow the standard procedure for checking results. \ No newline at end of file +To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results. \ No newline at end of file From b7ade1a762858c7927ccf351f310e205eda96c28 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 9 Jun 2021 14:53:55 -0400 Subject: [PATCH 40/45] Updating md5s for new singularity image --- encode-wrapper/computemd5s.py | 104 ++++++++++----------- encode-wrapper/expected_md5s_h3k27me3.json | 26 +++--- encode-wrapper/expected_md5s_h3k4me3.json | 26 +++--- 3 files changed, 76 insertions(+), 80 deletions(-) diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py index bc57b76..3d524fc 100644 --- a/encode-wrapper/computemd5s.py +++ b/encode-wrapper/computemd5s.py @@ -5,78 +5,74 @@ def findfiles(base, pattern): - cmd = "find {0} -name '{1}'".format(base, pattern) - print2(cmd) - p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - return [e.decode("utf-8").strip() for e in p.stdout.readlines()] + cmd = "find {0} -name '{1}'".format(base, pattern) + print2(cmd) + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return [e.decode("utf-8").strip() for e in p.stdout.readlines()] def flistsize(fs): - return {e : os.stat(e).st_size for e in fs} + return {e : os.stat(e).st_size for e in fs} def byino(fs): - hashed = dict() - 
for e in fs: - ino = os.stat(e).st_ino - if not ino in hashed: hashed[ino] = list() - hashed[ino].append(e) - hashed2 = dict() - full_flist = dict() - for k,v in hashed.items(): - sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) - hashed2[k] = sortedfiles[0] - assert not sortedfiles[0] in full_flist - full_flist[sortedfiles[0]] = sortedfiles[1:] - return (hashed2, full_flist) + hashed = dict() + for e in fs: + ino = os.stat(e).st_ino + if not ino in hashed: hashed[ino] = list() + hashed[ino].append(e) + hashed2 = dict() + full_flist = dict() + for k,v in hashed.items(): + sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) + hashed2[k] = sortedfiles[0] + assert not sortedfiles[0] in full_flist + full_flist[sortedfiles[0]] = sortedfiles[1:] + return (hashed2, full_flist) def md5script(hashed): - def cmd(f): - if f.strip().endswith('bam'): - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) - else: - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif md5sum {0})"'.format(f, os.path.basename(f)) - - return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] + def cmd(f): + if f.strip().endswith('bam'): + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0.7.1-rev1.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) + else: + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0.7.1-rev1.sif md5sum {0})"'.format(f, os.path.basename(f)) + + return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] def trackoutput(base, i, filereport): - logerr('# looking in {0}\n'.format(base)) - bams = findfiles(base, '*.bam') - narrowpeaks = findfiles(base, '*narrow*gz') - (bamsbyino, bams_flist) = byino(bams) - (peaksbyino, peaks_flist) = byino(narrowpeaks) + logerr('# looking in 
{0}\n'.format(base)) + bams = findfiles(base, '*.bam') + narrowpeaks = findfiles(base, '*narrow*gz') + (bamsbyino, bams_flist) = byino(bams) + (peaksbyino, peaks_flist) = byino(narrowpeaks) + + if not filereport: + print2(writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino))) - if not filereport: - print2(writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino))) - - qc = findfiles(base, 'qc.html') - print2(qc) + qc = findfiles(base, 'qc.html') + print2(qc) - return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1]} + return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1]} def main(args): - assert len(args) == 2, '__only_one_target_directory_at_a_time__' - [targets, tag] = args #['.'] if len(args) == 0 else args - output = list() - keep = list() - - for i, arg in enumerate([targets]): - record = trackoutput(arg, tag, False) - output.append(record) - keep.extend(record['bams']) - keep.extend(record['peaks']) - - print2(jdumpf('./filereport.json', output)) - print2(jdumpf('./file_shortreport.json', list(map(lambda o: {k: sorted(o[k].keys()) for k in o}, output)))) - print2('size', sum(flistsize(keep).values())) - - + assert len(args) == 2, '__only_one_target_directory_at_a_time__' + [targets, tag] = args #['.'] if len(args) == 0 else args + output = list() + keep = list() -if __name__ == '__main__': - main(sys.argv[1:]) + for i, arg in enumerate([targets]): + record = trackoutput(arg, tag, False) + output.append(record) + keep.extend(record['bams']) + keep.extend(record['peaks']) + print2(jdumpf('./filereport.json', output)) + print2(jdumpf('./file_shortreport.json', list(map(lambda o: {k: sorted(o[k].keys()) for k in o}, output)))) + print2('size', sum(flistsize(keep).values())) +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/encode-wrapper/expected_md5s_h3k27me3.json b/encode-wrapper/expected_md5s_h3k27me3.json index 
5b189bc..9d95679 100644 --- a/encode-wrapper/expected_md5s_h3k27me3.json +++ b/encode-wrapper/expected_md5s_h3k27me3.json @@ -1,21 +1,21 @@ { "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "018ad8f5f3158534320ed359563878d3" + "bcb0870d7fb36b19aa6e5bcfb15d5074" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "defd886ab7923b952e04ee033a722fac" + "a40596e82a52219105683fe6b2296414" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "b1ae4fb3f2b68b3c8346c57fa04f476f" + "8ebf1f3415468180c06f2845eb8b7ea1" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "0f38658b68706ec12b5faded1141750e" + "c2647c48949a708b170ce43a16542974" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "b1ac6ab70d053b546f186080639252ed" + "1c8896e5a37dd0ffd85819c158ac2473" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "1c9554fe8b67e61fd7c69a1881ec2e3a" + "83330d3dbd5b984e363bbb1f91c7af20" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ "55de2037c6657d1027fb6b625822fa8b" @@ -27,24 +27,24 @@ "7a52f55148b47e2a48fac330e3672c96" ], "conservative_peak.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "conservative_peak.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + "212aa0cc0a0a732280e59026ba8fda69" ], "optimal_peak.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "optimal_peak.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + 
"212aa0cc0a0a732280e59026ba8fda69" ], "rep1-pr.overlap.bfilt.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "rep1-pr.overlap.bfilt.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + "212aa0cc0a0a732280e59026ba8fda69" ], "rep1-pr.overlap.narrowPeak.gz": [ - "a896c1ec4693ddbd2e098ffa901c1f2a" + "4be7f27238a6f2818d052546ea7508ed" ] } \ No newline at end of file diff --git a/encode-wrapper/expected_md5s_h3k4me3.json b/encode-wrapper/expected_md5s_h3k4me3.json index 472cb8f..0354405 100644 --- a/encode-wrapper/expected_md5s_h3k4me3.json +++ b/encode-wrapper/expected_md5s_h3k4me3.json @@ -1,21 +1,21 @@ { "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "11df65c690dbde63772231167cc3e3c6" + "2b5b6d36b1ffc7312e975995665454a5" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "67d499fe7a0f5442f0c2fe943599b6bd" + "ff7c420e651a8eaadc5d1728ed7748cf" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "ce55328fad51c032eb6e532ebcc5a7ee" + "d160f97b59e457cef66f4646350287a1" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "76f750f47ac517d611c508dfd7de30e9" + "b1bf7417e9482329c7aa0052bc4cdc14" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "157ece5658aa3e84aa036ce59c54d839" + "1754bfa54f29838274e3d89070a91a0c" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "12d46443a1470ec7fe4ea2b308aadde0" + "22e3b8448788c81e80cff2224a0516fa" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ 
"0b624f273c1114314fd5c484580f28f8" @@ -27,24 +27,24 @@ "c9661487f0a63f3a59dc76c91bb58550" ], "conservative_peak.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "conservative_peak.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "optimal_peak.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "optimal_peak.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "rep1-pr.overlap.bfilt.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "rep1-pr.overlap.bfilt.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "rep1-pr.overlap.narrowPeak.gz": [ - "8171d3af09f6ba3e69e995155ba8658f" + "fd19d314a10c6b41e0920332df678c7a" ] } \ No newline at end of file From e6f44bac99e65208e6fab1958a0da04ee125544d Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 9 Jun 2021 15:02:25 -0400 Subject: [PATCH 41/45] Adding execute permission for compute canada launchers --- encode-wrapper/computecanada_encode_test_tasks.sh | 0 encode-wrapper/computecanada_wrapper.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 encode-wrapper/computecanada_encode_test_tasks.sh mode change 100644 => 100755 encode-wrapper/computecanada_wrapper.sh diff --git a/encode-wrapper/computecanada_encode_test_tasks.sh b/encode-wrapper/computecanada_encode_test_tasks.sh old mode 100644 new mode 100755 diff --git a/encode-wrapper/computecanada_wrapper.sh b/encode-wrapper/computecanada_wrapper.sh old mode 100644 new mode 100755 From 40f9aeb5d43886a74f824d9d2c48bbf593dfb702 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 9 Jun 2021 15:46:26 -0400 Subject: [PATCH 42/45] Updating new singularity image --- encode-wrapper/computemd5s.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py index 3d524fc..fc78025 100644 --- a/encode-wrapper/computemd5s.py +++ b/encode-wrapper/computemd5s.py @@ -32,9 +32,9 @@ def byino(fs): def md5script(hashed): def cmd(f): if f.strip().endswith('bam'): - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0.7.1-rev1.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0_7_1-rev1.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) else: - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0.7.1-rev1.sif md5sum {0})"'.format(f, os.path.basename(f)) + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0_7_1-rev1.sif md5sum {0})"'.format(f, os.path.basename(f)) return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] From 4d43c3d08683b418393d50bca0ddb26fea23d17e Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 9 Jun 2021 16:01:25 -0400 Subject: [PATCH 43/45] Updating documentation --- ...anada_ressources.json => computecanada_resources.json} | 0 encode-wrapper/ihec_standard_workflow.md | 2 +- encode-wrapper/readme.md | 8 ++++---- 3 files changed, 5 insertions(+), 5 deletions(-) rename encode-wrapper/{computecanada_ressources.json => computecanada_resources.json} (100%) diff --git a/encode-wrapper/computecanada_ressources.json b/encode-wrapper/computecanada_resources.json similarity index 100% rename from encode-wrapper/computecanada_ressources.json rename to encode-wrapper/computecanada_resources.json diff --git a/encode-wrapper/ihec_standard_workflow.md b/encode-wrapper/ihec_standard_workflow.md index c72e0a5..6c51557 100644 --- a/encode-wrapper/ihec_standard_workflow.md +++ b/encode-wrapper/ihec_standard_workflow.md @@ -1,4 +1,4 @@ -# IHEC ChIP-Seq standard workdlows +# IHEC ChIP-Seq standard 
workflows See the ENCODE reference for input format: https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/docs/input.md diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 1765c8f..92e6461 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -85,8 +85,8 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ - `computecanada_wrapper.sh` for Compute Canada users. However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded by adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. Usage: `./computecanada_wrapper.sh path/to/piperunner.sh input.json Local output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. -To use custom ressources you can add to your input.json file specific sections. For Compute Canada users the file `compute_canada_ressources.json` is already defined; you can refer to this one for other HPCs. -To merge the ressources.json and the input.json: `jq -s '.[0] * .[1]' input.json ressources.json > output_merged.json` +To use custom resources you can add to your input.json file specific sections. For Compute Canada users the file `compute_canada_resources.json` is already defined; you can refer to this one for other HPCs. +To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json resources.json > output_merged.json` ### ENCODE tests @@ -194,8 +194,8 @@ While the slurm_backend as defined by the encode pipeline will/should work; howe ### Compute Canada -If you are a Compute Canada user you can customize ressources for different steps by using the file compute_canada_ressources.json. 
-To merge the ressources.json and the input.json: `jq -s '.[0] * .[1]' input.json compute_canada_beluga_ressources.json > output_merged.json` +If you are a Compute Canada user you can customize resources for different steps by using the file computecanada_resources.json. +To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json computecanada_resources.json > output_merged.json` To setup the pipeline you need to do the following: - Adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. From 5ab13d0d8e70d81e913f5446f8a8eb26113e8984 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 9 Jun 2021 16:27:50 -0400 Subject: [PATCH 44/45] Updating documentation --- encode-wrapper/readme.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 92e6461..17fbad2 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -32,7 +32,7 @@ The analysis will generate a `qc.json` file (as well as an html version) along w ## Pulling Singularity image and generating wrapper scripts -These scripts require `python 3.6.8` or higher. It's assmumed that the the underlying OS supports `overlayfs` so paths that do not exist on the singularity can be mounted inside singularity (CentOS7 should work fine). CentOS6 does not have `overlayfs` support. If you need support for OS without `overlayfs` please make an issue. +These scripts require `python 3.6.8` or higher. It's assumed that the underlying OS supports `overlayfs` so paths that do not exist on the singularity can be mounted inside singularity (CentOS7 should work fine). CentOS6 does not have `overlayfs` support. If you need support for OS without `overlayfs` please make an issue. Check singularity version with `singularity --version` to make sure it's at least `3.0.1`. 
@@ -80,19 +80,19 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/ ## Running tests -/!\ There are two scripts for running the pipeline for two specific use cases: -- `singularity_wrapper.sh` for a Local run only with a wrapper encapsulating the call of the pipeline inside the singularity image. Usage: `./singularity_wrapper.sh input.json output_dir` with output_dir optional, if not provided the output will be in the working directory. + -- `computecanada_wrapper.sh` for Compute Canada users. However you need to ensure to have a python3, a java and a Singularity version 3 or above loaded by adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. Usage: `./computecanada_wrapper.sh path/to/piperunner.sh input.json Local output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. + -To use custom resources you can add to your input.json file specific sections. For Compute Canada users the file `compute_canada_resources.json` is already defined; you can refer to this one for other HPCs. -To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json resources.json > output_merged.json` + ### ENCODE tests -To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. +To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. For testing in Compute Canada see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1). 
-You will need atleast 10G of memory for running the encode test tasks. +You will need at least 10G of memory for running the encode test tasks. Make sure all test pass, by looking through jsons generated. `./status_encode_tasks.py` can be used here. @@ -133,15 +133,15 @@ IHEC tests on Local mode can be run with: `./piperunner.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out` or replacing slurm_singularity by pbs_singularity for pbs HPCs. --> -For Compute Canada users: `./slurm_wrapper.sh piperunner.sh cemt0007_h3k4me3.json Local h3k4me3_out` and `./slurm_wrapper.sh piperunner.sh cemt0007_h3k27me3.json Local h3k27me3_out`. +For testing in Compute Canada see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1). The provided configuration files are for 75bp PET only. Standard configration files for SET and read lengths will be provided. The ENCODE documentation discusses other modes. For these tests, the running time can be 24 hours depending on hardware. -To compute md5s of generated file, use `computemd5s.py ` with `` being the output directory of previous step and `` being the suffix to add at file output basename `computemd5s_`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without the bam header as that may contain full paths names. +To compute md5s of generated files, use `computemd5s.py ` with `` being the output directory of previous step and `` being the suffix to add at file output basename `computemd5s_`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without the bam header as that may contain full paths names. 
-As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `outdir=$PWD/cromwell-executions/chip/93de85aa-d581-48df-b8ae-a91a6e88a21f`. So do +As an example, suppose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `outdir=$PWD/cromwell-executions/chip/93de85aa-d581-48df-b8ae-a91a6e88a21f`. So do python computemd5s.py $outdir test # the first must be the cromwell directory for the analysis, the second a suffix for the script chmod +x ./computemd5s_test @@ -181,11 +181,11 @@ See output of `./trackoutput.sh -outdir:$outdi ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved ./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files -Cromwell generates large number of files by creating hardlinks; this script attempts to resolve these links and keeping only one copy of each. `./unresolvedfiles.list` contains hardlinks that the script is unable to resolve because of mount issues or othet OS errors. +Cromwell generates large number of files by creating hardlinks; this script attempts to resolve these links and keeping only one copy of each. `./unresolvedfiles.list` contains hardlinks that the script is unable to resolve because of mount issues or other OS errors. It's expected that `unresolvedfiles.list` and `unexpectedfiles.list` are empty. If they are not empty, the files listed there will need to be looked at. Please review files before deleting to ensure nothing useful is removed. -The recommended workflow is to consider removing files from `delete.list` only (in case diskspace is an issue). And then symlink files from masterfiles.list (while keeping everything else) to a final analysis directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. 
+The recommended workflow is to consider removing files from `delete.list` only (in case disk-space is an issue). And then symlink files from masterfiles.list (while keeping everything else) to a final analysis directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts. ## Running on cluster @@ -199,8 +199,8 @@ To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json To setup the pipeline you need to do the following: - Adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. -- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh singularity_wrapper.sh input.json output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 cpu and 4700M of RAM per cpu (half a full node on Beluga), it can be customized to fit the user needs. +- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh singularity_wrapper.sh input.json output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 CPUs and 4700M of RAM per CPU (half a full node on Beluga), it can be customized to fit the user needs. To do ENCODE testing run: `./computecanada_encode_test_tasks.sh singularity_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results. 
-To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results. \ No newline at end of file +To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results. From f9bba756cab5bf248b9245e9ff9ec4abee1b41a7 Mon Sep 17 00:00:00 2001 From: Paul Stretenowich Date: Wed, 7 Jul 2021 11:45:53 -0400 Subject: [PATCH 45/45] Updating documentation --- encode-wrapper/readme.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 17fbad2..abe4af0 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -198,9 +198,11 @@ If you are a Compute Canada user you can customize resources for different steps To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json computecanada_resources.json > output_merged.json` To setup the pipeline you need to do the following: +- Load singularity by doing `module load singularity/3.7` and set up the default folder for pulling the image `mkdir -p /localscratch/$USER ; export SINGULARITY_TMPDIR=/localscratch/$USER` - Adding `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` in your `.bashrc`. +- Pulling all the resources needed for running the tests `./get_encode_resources.sh && python chip.py -get -pullimage -bindpwd -maketests && chmod +x *.sh` - Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`.
Usage: `./computecanada_wrapper.sh singularity_wrapper.sh input.json output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 CPUs and 4700M of RAM per CPU (half a full node on Beluga), it can be customized to fit the user needs. To do ENCODE testing run: `./computecanada_encode_test_tasks.sh singularity_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results. -To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results. +To do MCF10A testing use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results (the checking scripts need sambamba; you can run `module load mugqic/sambamba` to have it available in your environment).