diff --git a/encode-wrapper/backend.conf b/encode-wrapper/backend.conf index 874607c..ecba615 100644 --- a/encode-wrapper/backend.conf +++ b/encode-wrapper/backend.conf @@ -23,6 +23,8 @@ backend { submit = """ ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (sbatch \ --export=ALL \ + --mail-type=END,FAIL --mail-user=$JOB_MAIL \ + -A $RAP_ID \ -J ${job_name} \ -D ${cwd} \ -o ${out} \ @@ -36,7 +38,7 @@ backend { ${"--account " + slurm_account} \ ${"--gres gpu:" + gpu} \ ${slurm_extra_param} \ - --wrap "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") + --wrap "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}") """ kill = "scancel ${job_id}" check-alive = "squeue -j ${job_id}" @@ -44,44 +46,34 @@ backend { } } - sge_singularity { + pbs_singularity { actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" config { - script-epilogue = "sleep 30 && sync" + script-epilogue = "sleep 30" concurrent-job-limit = 50 runtime-attributes = """ - String sge_pe = "shm" Int cpu = 1 Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? sge_extra_param + Int time = 1 + Int memory_mb = 1024 String singularity_container String? singularity_bindpath """ submit = """ - ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ - -S /bin/sh \ - -terse \ - -b n \ + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ -N ${job_name} \ - -wd ${cwd} \ -o ${out} \ -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \ - ${"-l h_vmem=" + memory_mb/cpu + "m"} \ - ${"-l s_vmem=" + memory_mb/cpu + "m"} \ - ${"-l h_rt=" + time*3600} \ - ${"-l s_rt=" + time*3600} \ - ${"-q " + sge_queue} \ - ${"-l gpu=" + gpu} \ - ${sge_extra_param} \ - -V) + -l nodes=1:ppn=${cpu} \ + -l mem=${memory_mb}MB \ + -l walltime=${time}:0:0 \ + ${if gpu>1 then "-lngpus=" + gpu else ""} \ + -V + ) """ kill = "qdel ${job_id}" check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" + job-id-regex = "(\\d+).*" } } @@ -96,7 +88,7 @@ backend { String? singularity_bindpath """ submit = """ - ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script} & echo $! 
&& disown) + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script} & echo $! && disown) """ job-id-regex = "(\\d+)" check-alive = "ps -ef | grep -v grep | grep ${job_id}" @@ -110,112 +102,6 @@ backend { concurrent-job-limit = 10 } } - - sge { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - String sge_pe = "shm" - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? sge_extra_param - """ - submit = """ - qsub \ - -S /bin/sh \ - -terse \ - -b n \ - -N ${job_name} \ - -wd ${cwd} \ - -o ${out} \ - -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " + cpu else " "} \ - ${"-l h_vmem=" + memory_mb/cpu + "m"} \ - ${"-l s_vmem=" + memory_mb/cpu + "m"} \ - ${"-l h_rt=" + time*3600} \ - ${"-l s_rt=" + time*3600} \ - ${"-q " + sge_queue} \ - ${"-l gpu=" + gpu} \ - ${sge_extra_param} \ - -V \ - ${script} - """ - kill = "qdel ${job_id}" - check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" - } - } - - slurm { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? slurm_partition - String? slurm_account - String? slurm_extra_param - """ - submit = """ - sbatch \ - --export=ALL \ - -J ${job_name} \ - -D ${cwd} \ - -o ${out} \ - -e ${err} \ - ${"-t " + time*60} \ - -n 1 \ - --ntasks-per-node=1 \ - ${"--cpus-per-task=" + cpu} \ - ${"--mem=" + memory_mb} \ - ${"-p " + slurm_partition} \ - ${"--account " + slurm_account} \ - ${"--gres gpu:" + gpu} \ - ${slurm_extra_param} \ - --wrap "/bin/bash ${script}" - """ - kill = "scancel ${job_id}" - check-alive = "squeue -j ${job_id}" - job-id-regex = "Submitted batch job (\\d+).*" - } - } - - google { - actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory" - config { - # Google project - project = "your-project-name" - - # Base bucket for workflow executions - root = "gs://your-bucket-name" - - concurrent-job-limit = 1000 - genomics-api-queries-per-100-seconds = 1000 - maximum-polling-interval = 600 - - genomics { - auth = "application-default" - compute-service-account = "default" - endpoint-url = "https://genomics.googleapis.com/" - restrict-metadata-access = false - } - - filesystems { - gcs { - auth = "application-default" - } - } - } - } } } @@ -238,13 +124,3 @@ call-caching { enabled = false invalidate-bad-cache-results = true } - -google { - application-name = "cromwell" - auths = [ - { - name = "application-default" - scheme = "application_default" - } - ] -} diff --git a/encode-wrapper/backend_ihec_pbs_singularity.conf b/encode-wrapper/backend_ihec_pbs_singularity.conf new file mode 100644 index 0000000..047a39d --- /dev/null +++ b/encode-wrapper/backend_ihec_pbs_singularity.conf @@ -0,0 +1,65 @@ +include required(classpath("application")) + +backend { + default = "Local" + providers { + + pbs_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30" + concurrent-job-limit = 50 + runtime-attributes = """ + 
Int cpu = 1 + Int? gpu + Int time = 1 + Int memory_mb = 1024 + String singularity_container + String? singularity_bindpath + """ + submit = """ + ls ${singularity_container} $(echo ${singularity_bindpath} | tr , ' ') 1>/dev/null && (echo "chmod u+x ${script} && SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1),${singularity_bindpath} singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} ${script}" | qsub \ + -N ${job_name} \ + -o ${out} \ + -e ${err} \ + -l nodes=1:ppn=${cpu} \ + -l mem=${memory_mb}MB \ + -l walltime=${time}:0:0 \ + ${if gpu>1 then "-lngpus=" + gpu else ""} \ + -V + ) + """ + kill = "qdel ${job_id}" + check-alive = "qstat -j ${job_id}" + job-id-regex = "(\\d+).*" + } + } + + Local { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + concurrent-job-limit = 10 + } + } + } +} + +services { + LoadController { + class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" + config { + # disable it (for login nodes on Stanford SCG, Sherlock) + control-frequency = 21474834 seconds + } + } +} + +system { + abort-jobs-on-terminate = true + graceful-server-shutdown = true +} + +call-caching { + enabled = false + invalidate-bad-cache-results = true +} diff --git a/encode-wrapper/chip.py b/encode-wrapper/chip.py index b773538..79384fb 100644 --- a/encode-wrapper/chip.py +++ b/encode-wrapper/chip.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import subprocess import sys @@ -14,247 +16,239 @@ debug_mode = False def base(): - return os.path.dirname(os.path.realpath(__file__)) + return os.path.dirname(os.path.realpath(__file__)) def wget(url, debug=debug_mode): - logerr('getting: {}\n'.format(url)) - if debug: - logerr(' ..debug: wget {0}\n'.format(url)) - dumpf(os.path.basename(url), 'test:{0}'.format(url)) - return - p = subprocess.Popen('wget ' + url ,shell=True) - return p.wait() - - - + logerr('getting: {}\n'.format(url)) + if debug: + logerr(' ..debug: wget {0}\n'.format(url)) + dumpf(os.path.basename(url), 'test:{0}'.format(url)) + return + p = subprocess.Popen('wget ' + url ,shell=True) + return p.wait() + + def get_hg38_resources(home): - base = os.path.abspath(os.getcwd()) - mkdirs('hg38_resources/genome_hg38/bwa_index') - os.chdir('./hg38_resources') - for f in [ - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.blacklist.bed.gz', - 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.chrom.sizes', - ]: - wget(f) - movefile('hg38.blacklist.bed.gz', './genome_hg38/') - movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', './genome_hg38/') - movefile('hg38.chrom.sizes', './genome_hg38/') - movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', './genome_hg38/bwa_index/') - config = '\n'.join([ - "blacklist {0}/genome_hg38/hg38.blacklist.bed.gz", - "chrsz {0}/genome_hg38/hg38.chrom.sizes", - "gensz hs", - "bowtie2_idx_tar /dev/null", - "bwa_idx_tar {0}/genome_hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar", - "ref_fa {0}/genome_hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta", - ]).format(base) + '\n' - logerr( dumpf('./hg38_local.tsv', config) + '\n' ) - base_config = jdumpf('./base_config.json', { 'chip.genome_tsv' : 
os.path.abspath('./hg38_local.tsv'), 'base' : base }) - os.chdir(home) - return base_config + base = os.path.abspath(os.getcwd()) + mkdirs('hg38_resources/genome_hg38/bwa_index') + os.chdir('./hg38_resources') + for f in [ + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.blacklist.bed.gz', + 'http://www.epigenomes.ca/data/CEMT/resources/chip_v2/hg38.chrom.sizes', + ]: + wget(f) + movefile('hg38.blacklist.bed.gz', './genome_hg38/') + movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta', './genome_hg38/') + movefile('hg38.chrom.sizes', './genome_hg38/') + movefile('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar', './genome_hg38/bwa_index/') + config = '\n'.join([ + "blacklist {0}/genome_hg38/hg38.blacklist.bed.gz", + "chrsz {0}/genome_hg38/hg38.chrom.sizes", + "gensz hs", + "bowtie2_idx_tar /dev/null", + "bwa_idx_tar {0}/genome_hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar", + "ref_fa {0}/genome_hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta", + ]).format(base) + '\n' + logerr(dumpf('./hg38_local.tsv', config) + '\n') + base_config = jdumpf('./base_config.json', {'chip.genome_tsv': os.path.abspath('./hg38_local.tsv'), 'base': base}) + os.chdir(home) + return base_config def existing_ref_config(configfile): - home = os.path.abspath(os.getcwd()) - config = jloadf(configfile) - hashed = { k : os.path.realpath(config[k]) if k in ['blacklist', 'chrsz', 'ref_fa', 'bwa_idx_tar'] else config[k] for k in config } - mkdirs('./hg38_resources') - os.chdir('./hg38_resources') - config = '\n'.join([e.strip() for e in '''blacklist {blacklist} - chrsz {chrsz} - gensz {gensz} - bowtie2_idx_tar {bowtie2_idx_tar} - bwa_idx_tar {bwa_idx_tar} - ref_fa {ref_fa}'''.format(**hashed).splitlines() ]) + '\n' - logerr( './hg38_resources/' + dumpf('./hg38_local.tsv', config) + '\n' ) - base_config = jdumpf('./base_config.json', { 'chip.genome_tsv' : os.path.abspath('./hg38_local.tsv'), 'base' : home }) - os.chdir(home) - return home + '/hg38_resources/base_config.json' + home = os.path.abspath(os.getcwd()) + config = jloadf(configfile) + hashed = {k: os.path.realpath(config[k]) if k in ['blacklist', 'chrsz', 'ref_fa', 'bwa_idx_tar'] else config[k] for k in config} + mkdirs('./hg38_resources') + os.chdir('./hg38_resources') + config = '\n'.join([e.strip() for e in '''blacklist {blacklist} + chrsz {chrsz} + gensz {gensz} + bowtie2_idx_tar {bowtie2_idx_tar} + bwa_idx_tar {bwa_idx_tar} + ref_fa {ref_fa}'''.format(**hashed).splitlines()]) + '\n' + logerr('./hg38_resources/' + dumpf('./hg38_local.tsv', config) + '\n') + base_config = jdumpf('./base_config.json', {'chip.genome_tsv': os.path.abspath('./hg38_local.tsv'), 'base': home}) + os.chdir(home) + return home + '/hg38_resources/base_config.json' def rm(target): - try: - shutil.rmtree(target) - except OSError as e: - logerr("# error: {0} / {1}".format(target, e.strerror)) + try: + shutil.rmtree(target) + except OSError as e: + logerr("# error: {0} / {1}".format(target, e.strerror)) def get_test_data(configfile, home): - config = jloadf(configfile) - os.chdir('./v2/ihec/test_data') - oks = dict() - for k in config['data']: - oks[k] = False - for url in config['data'][k]: - if wget(url) == 0: - oks[k] = True - break - else: - logerr('# failed downloading:' + url) - incomplete = glob.glob('./' + 
os.path.basename(url))
-            if len(incomplete) > 0:
-                assert len(incomplete) == 1, incomplete
-                shutil.remove(incomplete[0])
-                logerr('# removed failed download.. ' + incomplete[0])
-    os.chdir(home)
-    for k in oks:
-        assert oks[k], ['could not download all test data', k]
+    config = jloadf(configfile)
+    os.chdir('./v2/ihec/test_data')
+    oks = dict()
+    for k in config['data']:
+        oks[k] = False
+        for url in config['data'][k]:
+            if wget(url) == 0:
+                oks[k] = True
+                break
+            else:
+                logerr('# failed downloading:' + url)
+                incomplete = glob.glob('./' + os.path.basename(url))
+                if len(incomplete) > 0:
+                    assert len(incomplete) == 1, incomplete
+                    os.remove(incomplete[0])  # shutil has no remove(); os.remove deletes the partial download
+                    logerr('# removed failed download.. ' + incomplete[0])
+    os.chdir(home)
+    for k in oks:
+        assert oks[k], ['could not download all test data', k]
 
 
 def make_tests(args):
-    mcf10a = ['cemt0007_h3k4me3_template.json', 'cemt0007_h3k27me3_template.json']
-
-    if os.path.isfile('./hg38_resources/base_config.json'):
-        config = jloadf('./hg38_resources/base_config.json')
-        base = config['base']
-    else:
-        if '-pwd2ext0' in args : raise Exception('-pwd2ext0 is no longer supported')
-        base = os.path.abspath(os.getcwd())
-
-    def fix(fname, base):
-        assert fname.endswith('_template.json')
-        out = './v2/ihec/{0}.json'.format(fname[0:-len('_template.json')])
-        config = jloadf(fname)
-        return dumpf(out, jsonp(config).replace('{0}', base))
-
-    for f in mcf10a:
-        print2('written:', fix(f, base))
-
-def write_testrun(l_config):
-    for i in range(len(l_config)):# config in l_config:
-        if i == 0:
-            with open('testrun_template.sh') as infile:
-                logerr('#written:' + dumpf('{0}/piperunner.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n')
-            with open('testrun_tasks_template.sh') as infile:
-                logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n')
-
-            encode_tests = [
-                '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity',
-                '\n{additional_binds}\n\nsingularity exec --cleanenv $BINDPATHS {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n'
-            ]
-            logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**l_config[i])))
-            mcf_tests = [
-                '#!/bin/bash', 'echo "home:$PWD"', "which singularity",
-                'if [[ $# > 1 ]]; then OUTDIR="$2"; else OUTDIR=""; fi',
-                'BACKEND="{backend_default}"',
-
-                '\n{additional_binds}\nsingularity exec --cleanenv $BINDPATHS {container_image} {home_mnt}/piperunner.sh $1 $BACKEND $OUTDIR\n\n'
-            ]
-            logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**l_config[i])))
-        else:
-            with open('testrun_template.sh') as infile:
-                logerr('#written:' + dumpf('{0}/piperunner_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n')
-            with open('testrun_tasks_template.sh') as infile:
-                logerr('#written:' + dumpf('{0}/testrun_tasks_ihec_slurm_singularity.sh'.format(l_config[i]['home']), infile.read().format(**l_config[i])) + '\n')
-
-    return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n{additional_binds}\n\nsingularity exec $BINDPATHS {container_image} python trackoutput.py $@\n\n'.format(**l_config[i]))
+    mcf10a = ['cemt0007_h3k4me3_template.json', 'cemt0007_h3k27me3_template.json']
+
+    if os.path.isfile('./hg38_resources/base_config.json'):
+        config = jloadf('./hg38_resources/base_config.json')
+        base = config['base']
+    else:
+        base = '/mnt/ext_0' if '-pwd2ext0' in args else 
os.path.abspath(os.getcwd()) + + def fix(fname, base): + assert fname.endswith('_template.json') + out = './v2/ihec/{0}.json'.format(fname[0:-len('_template.json')]) + config = jloadf(fname) + return dumpf(out, jsonp(config).replace('{0}', base)) + + for f in mcf10a: + print2('written:', fix(f, base)) + +def write_testrun(config): + with open('testrun_template.sh') as infile: + logerr('#written:' + dumpf('{0}/piperunner.sh'.format(config['home']), infile.read().format(**config)) + '\n') + with open('testrun_tasks_template.sh') as infile: + logerr('#written:' + dumpf('{0}/testrun_tasks.sh'.format(config['home']), infile.read().format(**config)) + '\n') + + encode_tests = [ + '#!/bin/bash\n\necho "home:$PWD"\n\nwhich singularity', + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/encode_test_tasks_run.sh {home_mnt} Local ${{@:1}}\n\n' + ] + logerrn('#written:' + dumpf('./singularity_encode_test_tasks.sh', '\n'.join(encode_tests).format(**config))) + mcf_tests = [ + '#!/bin/bash', 'echo "home:$PWD"', "which singularity", + '\nsingularity exec --cleanenv {additional_binds} {container_image} {home_mnt}/piperunner.sh $1 Local $2\n\n' + ] + logerrn(dumpf('./singularity_wrapper.sh', '\n'.join(mcf_tests).format(**config))) + + return dumpf('./trackoutput.sh', '#!/bin/bash\n\necho "home:$PWD"\nwhich singularity\n\n\nsingularity exec {additional_binds} {container_image} python trackoutput.py $@\n\n'.format(**config)) def singularity_pull_image(home, config, binds, debug=debug_mode): - imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1-rev1' - image_version = imageurl.split(':')[-1].replace('.', '_') - os.chdir('./images') - if debug: - dumpf('./debug.img', 'test:{0}'.format('singularity')) - else: - cmd = 'singularity pull {0}'.format(imageurl) - logerr('# .. ' + cmd + '\n') - if not '-nobuild' in config: - shell(cmd, assert_ok = True) - - images = glob.glob('./*img') + glob.glob('./*.sif') - assert len(images) == 1, images - image_label = 'chip_seq_pipeline_{0}'.format(image_version) - image_ext = images[0].split('.')[-1] - image_name = '{0}.{1}'.format(image_label, image_ext) - logerr('# pulled image: {0}, moved: {1}\n'.format(images[0], image_name)) - os.rename(images[0], image_name) - image_path = os.path.abspath(image_name) - os.chdir(home) - home_mnt = "/mnt/ext_0" if '-pwd2ext0' in config else home - container_mnt = '{0}/v2/singularity_container.json'.format(home_mnt) - container = jdumpf('./v2/singularity_container.json', { - "default_runtime_attributes" : { - "singularity_container" : '{0}/images/{1}'.format(home_mnt, image_name) , - "singularity_instance_name": image_label - } - }) - binds_pwd="-B $PWD" - shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/v2/'.format(image_path, binds_pwd, "$PWD"), assert_ok=True) - shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/'.format(image_path, binds_pwd, "$PWD"), assert_ok=True) - if not os.path.exists('./chip.wdl') or not os.path.exists('./v2/chip.wdl'): - raise Exception('__could_not_copy__:chip.wdl likey current directory is not bound in the container... 
' + binds)
-    logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n')
-    logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n')
-
-    return [{
-        'additional_binds' : binds,
-        "container_image":image_path,
-        "home" : home,
-        "home_mnt": home_mnt,
-        "bind_opt": "${3:-}",
-        "backend_default" : "${3:-Local}",
-        "container" : container_mnt, #os.path.abspath(container),
-        "wdl" : "{0}/v2/chip.wdl".format(home_mnt),
-        "backend" : "{0}/backend.conf".format(home_mnt)
-    },
-    {
-        'additional_binds' : binds,
-        "container_image":image_path,
-        "home" : home,
-        "home_mnt": home_mnt,
-        "bind_opt": "${3:-}",
-        "backend_default" : "${3:-Local}",
-
-        "container" : container_mnt, #os.path.abspath(container),
-        "wdl" : "{0}/v2/chip.wdl".format(home_mnt),
-        "backend" : "{0}/backend_ihec_slurm_singularity.conf".format(home_mnt)
-    }]
+    #imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.2'
+    imageurl = 'docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.4-sambamba-0.7.1-rev1'
+    image_version = imageurl.split(':')[-1].replace('.', '_')
+    os.chdir('./images')
+    if debug:
+        dumpf('./debug.img', 'test:{0}'.format('singularity'))
+    else:
+        cmd = 'singularity pull {0}'.format(imageurl)
+        logerr('# .. ' + cmd + '\n')
+        if not '-nobuild' in config:
+            shell(cmd, assert_ok=True)
+
+    images = glob.glob('./*img') + glob.glob('./*.sif')
+    assert len(images) == 1, images
+    image_label = 'chip_seq_pipeline_{0}'.format(image_version)
+    image_ext = images[0].split('.')[-1]
+    image_name = '{0}.{1}'.format(image_label, image_ext)
+    logerr('# pulled image: {0}, moved: {1}\n'.format(images[0], image_name))
+    os.rename(images[0], image_name)
+    image_path = os.path.abspath(image_name)
+    os.chdir(home)
+    home_mnt = "/mnt/ext_0" if '-pwd2ext0' in config else home
+    container_mnt = '{0}/v2/singularity_container.json'.format(home_mnt)
+    container = jdumpf('./v2/singularity_container.json', {
+        "default_runtime_attributes" : {
+            "singularity_container" : '{0}/images/{1}'.format(home_mnt, image_name),
+            "singularity_instance_name": image_label
+        }
+    })
+
+    shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/v2/'.format(image_path, binds, home_mnt), assert_ok=True)
+    shell('singularity exec {1} {0} cp /software/chip-seq-pipeline/chip.wdl {2}/'.format(image_path, binds, home_mnt), assert_ok=True)
+    if not os.path.exists('./chip.wdl') or not os.path.exists('./v2/chip.wdl'):
+        raise Exception('__could_not_copy__:chip.wdl likely current directory is not bound in the container... 
' + binds) + logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./v2/chip.wdl\n') + logerr('# copied /software/chip-seq-pipeline/chip.wdl to ./chip.wdl\n') + return { + 'additional_binds' : binds, + "container_image":image_path, + "home" : home, + "home_mnt": home_mnt, + "bind_opt": "${3:-}", + "backend_default" : "${2:-Local}", + "container" : container_mnt, #os.path.abspath(container), + "wdl" : "{0}/v2/chip.wdl".format(home_mnt), + "backend" : "{0}/backend.conf".format(home_mnt) + } def bindargs(args): - binds = '''if [ -z "$BINDPATHS" ] ; then BINDPATHS="-B $PWD"; else BINDPATHS="-B $PWD,$BINDPATHS"; fi ; echo "# binding $BINDPATHS"; ''' - return binds + binds = '' + if not '-bindpwd' in args: + return binds + if '-bindpwd' in args: + params = [e for e in args if not e[0] == '-'] + if '-pwd2ext0'in args: + bindpwd = '-B {0}:/mnt/ext_0'.format(os.getcwd()) + offset = 1 + else: + bindpwd = '-B ' + os.getcwd() + offset = 1 + + if not params: + return bindpwd + else: + return bindpwd + ',' + ','.join([ '{1}:/mnt/ext_{0}'.format(i + offset, e) for i,e in enumerate(params)]) + return binds + def main(args): - home = base() - logerr('# prefix {0}\n'.format(home)) - mkdirs('./hg38_resources') - mkdirs('./images') - mkdirs('./v2/ihec/test_data') - - if '-clean' in args: - for d in ['./v2', './images', './hg38_resources']: - logerr('# removing {0}\n'.format(d)) - rm(d) - logerr('rm -rf ./v2/ images/ hg38_resources/ \n') - - if '-getref' in args: - get_hg38_resources(home) - - if '-refconfig' in args: - logerr(existing_ref_config('./ref_config.json') + '\n') - - if '-get' in args: - get_test_data('./test_config.json', home) - - if '-pullimage' in args: - params = [os.getcwd()] + [e for e in args if not e[0] == '-'] - binds = bindargs(args) - l_container_config = singularity_pull_image(home, args, binds, debug = False) - container = write_testrun(l_container_config) - logerr('# container: {0}\n'.format(container)) - - if '-maketests' in args: - make_tests(args) - - - logerrn("__finished__") - - + home = base() + logerr('# prefix {0}\n'.format(home)) + mkdirs('./hg38_resources') + mkdirs('./images') + mkdirs('./v2/ihec/test_data') + + if '-clean' in args: + for d in ['./v2', './images', './hg38_resources']: + logerr('# removing {0}\n'.format(d)) + rm(d) + logerr('rm -rf ./v2/ images/ hg38_resources/ \n') + + if '-getref' in args: + get_hg38_resources(home) + + if '-refconfig' in args: + logerr(existing_ref_config('./ref_config.json') + '\n') + + if '-get' in args: + get_test_data('./test_config.json', home) + + if '-pullimage' in args: + params = [os.getcwd()] + [e for e in args if not e[0] == '-'] + binds = bindargs(args) + container_config = singularity_pull_image(home, args, binds, debug=False) + container = write_testrun(container_config) + logerr('# container: {0}\n'.format(container)) + + if '-maketests' in args: + make_tests(args) + + + logerrn("__finished__") + + if __name__ == '__main__': - main(sys.argv[1:]) + main(sys.argv[1:]) diff --git a/encode-wrapper/computecanada_encode_test_tasks.sh b/encode-wrapper/computecanada_encode_test_tasks.sh new file mode 100755 index 0000000..98aa97e --- /dev/null +++ b/encode-wrapper/computecanada_encode_test_tasks.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +OUTPUTDIR=$1 + +JOB_OUTPUT=encode_test.$OUTPUTDIR.log +JOB_NAME=encode_test.$OUTPUTDIR +cat /dev/null > $JOB_OUTPUT +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2" +current_JOBID=$(echo "#!/bin/bash +$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID 
-D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=02:00:00 --mem-per-cpu=4700M -n 10 -N 1 | grep "[0-9]" | cut -d\ -f4) +echo $current_JOBID submitted... \ No newline at end of file diff --git a/encode-wrapper/computecanada_resources.json b/encode-wrapper/computecanada_resources.json new file mode 100644 index 0000000..d83affb --- /dev/null +++ b/encode-wrapper/computecanada_resources.json @@ -0,0 +1,38 @@ +{ + "_COMMENT.General": "Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(chip.align_cpu x NUMBER_OF_REPLICATES, chip.call_peak_cpu x 2 x NUMBER_OF_REPLICATES) because align and call_peak (especially for spp) are bottlenecking tasks of the pipeline. Use this total number of cores if you manually qsub or sbatch your job (using local mode of Caper).", + "_COMMENT.General.Beluga": "The amount of mem per cpu is 4700M in Beluga so you can put \"chip.whatever_cpu\" x 4700 for any \"chip.whatever_mem_mb\".", + + "chip.bwa_cpu": 15, + "chip.bwa_mem_mb": 70500, + "chip.bwa_time_hr": 24, + + "chip.filter_cpu": 10, + "chip.filter_mem_mb": 47000, + "chip.filter_time_hr": 12, + + "chip.bam2ta_cpu": 8, + "chip.bam2ta_mem_mb": 37600, + "chip.bam2ta_time_hr": 6, + + "chip.spr_mem_mb": 18800, + + "chip.fingerprint_cpu": 6, + "chip.fingerprint_mem_mb": 28200, + "chip.fingerprint_time_hr": 24, + + "chip.xcor_cpu": 6, + "chip.xcor_mem_mb": 28200, + "chip.xcor_time_hr": 24, + + "chip.macs2_mem_mb": 18800, + "chip.macs2_time_hr": 24, + + "chip.spp_cpu": 2, + "chip.spp_mem_mb": 16000, + "chip.spp_time_hr": 24, + + "_COMMENT.Java.general": "There are special parameters to control maximum Java heap memory (e.g. java -Xmx4G) for Picard tools. They are strings including size units. Such string will be directly appended to Java's parameter -Xmx. If these parameters are not defined then pipeline uses 90% of each task's memory (e.g. chip.filter_mem_mb).", + "chip.filter_picard_java_heap": "chip.filter_mem_mb", + "chip.align_trimmomatic_java_heap": "chip.align_mem_mb", + "chip.gc_bias_picard_java_heap": "10G" +} \ No newline at end of file diff --git a/encode-wrapper/computecanada_wrapper.sh b/encode-wrapper/computecanada_wrapper.sh new file mode 100755 index 0000000..8b7b4a6 --- /dev/null +++ b/encode-wrapper/computecanada_wrapper.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +PIPERUNNER=$1 +jobFile=$2 +BACKEND=$3 +if [[ $# -eq 4 ]]; then + OUTDIR=$4 +else + OUTDIR="" +fi + +filename=$(basename -- $2) +filename="${filename%.*}" + +JOB_OUTPUT=chipseq.$filename.log +JOB_NAME=chipseq.$filename +cat /dev/null > $JOB_OUTPUT +cmd="module load singularity/3.6 mugqic/java/openjdk-jdk1.8.0_72 && bash $1 $2 $3" +current_JOBID=$(echo "#!/bin/bash +$cmd" | sbatch --mail-type=END,FAIL --mail-user=$JOB_MAIL -A $RAP_ID -D $PWD -o $JOB_OUTPUT -J $JOB_NAME --time=48:00:00 --mem-per-cpu=4700M -n 20 -N 1 | grep "[0-9]" | cut -d\ -f4) +echo $current_JOBID submitted... 
\ No newline at end of file diff --git a/encode-wrapper/computemd5s.py b/encode-wrapper/computemd5s.py index bc57b76..fc78025 100644 --- a/encode-wrapper/computemd5s.py +++ b/encode-wrapper/computemd5s.py @@ -5,78 +5,74 @@ def findfiles(base, pattern): - cmd = "find {0} -name '{1}'".format(base, pattern) - print2(cmd) - p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - return [e.decode("utf-8").strip() for e in p.stdout.readlines()] + cmd = "find {0} -name '{1}'".format(base, pattern) + print2(cmd) + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return [e.decode("utf-8").strip() for e in p.stdout.readlines()] def flistsize(fs): - return {e : os.stat(e).st_size for e in fs} + return {e : os.stat(e).st_size for e in fs} def byino(fs): - hashed = dict() - for e in fs: - ino = os.stat(e).st_ino - if not ino in hashed: hashed[ino] = list() - hashed[ino].append(e) - hashed2 = dict() - full_flist = dict() - for k,v in hashed.items(): - sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) - hashed2[k] = sortedfiles[0] - assert not sortedfiles[0] in full_flist - full_flist[sortedfiles[0]] = sortedfiles[1:] - return (hashed2, full_flist) + hashed = dict() + for e in fs: + ino = os.stat(e).st_ino + if not ino in hashed: hashed[ino] = list() + hashed[ino].append(e) + hashed2 = dict() + full_flist = dict() + for k,v in hashed.items(): + sortedfiles = sorted(v, key = lambda x: (len(x), os.path.basename(x)) ) + hashed2[k] = sortedfiles[0] + assert not sortedfiles[0] in full_flist + full_flist[sortedfiles[0]] = sortedfiles[1:] + return (hashed2, full_flist) def md5script(hashed): - def cmd(f): - if f.strip().endswith('bam'): - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) - else: - return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4.sif md5sum {0})"'.format(f, os.path.basename(f)) - - return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] + def cmd(f): + if f.strip().endswith('bam'): + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0_7_1-rev1.sif ./headlessbam_md5 {0})"'.format(f, os.path.basename(f)) + else: + return 'echo "{1} $(singularity exec -B $PWD ./images/chip_seq_pipeline_v1_1_4-sambamba-0_7_1-rev1.sif md5sum {0})"'.format(f, os.path.basename(f)) + + return [cmd(v) for v in sorted(hashed.values(), key= lambda x: os.path.basename(x))] def trackoutput(base, i, filereport): - logerr('# looking in {0}\n'.format(base)) - bams = findfiles(base, '*.bam') - narrowpeaks = findfiles(base, '*narrow*gz') - (bamsbyino, bams_flist) = byino(bams) - (peaksbyino, peaks_flist) = byino(narrowpeaks) + logerr('# looking in {0}\n'.format(base)) + bams = findfiles(base, '*.bam') + narrowpeaks = findfiles(base, '*narrow*gz') + (bamsbyino, bams_flist) = byino(bams) + (peaksbyino, peaks_flist) = byino(narrowpeaks) + + if not filereport: + print2(writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino))) - if not filereport: - print2(writef('./computemd5s_{0}'.format(i), ['#!/bin/bash'] + md5script(bamsbyino) + md5script(peaksbyino))) - - qc = findfiles(base, 'qc.html') - print2(qc) + qc = findfiles(base, 'qc.html') + print2(qc) - return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1]} + return { 'bams' : bams_flist, 'peaks' : peaks_flist, 'qc' : byino(qc)[1]} def 
main(args): - assert len(args) == 2, '__only_one_target_directory_at_a_time__' - [targets, tag] = args #['.'] if len(args) == 0 else args - output = list() - keep = list() - - for i, arg in enumerate([targets]): - record = trackoutput(arg, tag, False) - output.append(record) - keep.extend(record['bams']) - keep.extend(record['peaks']) - - print2(jdumpf('./filereport.json', output)) - print2(jdumpf('./file_shortreport.json', list(map(lambda o: {k: sorted(o[k].keys()) for k in o}, output)))) - print2('size', sum(flistsize(keep).values())) - - + assert len(args) == 2, '__only_one_target_directory_at_a_time__' + [targets, tag] = args #['.'] if len(args) == 0 else args + output = list() + keep = list() -if __name__ == '__main__': - main(sys.argv[1:]) + for i, arg in enumerate([targets]): + record = trackoutput(arg, tag, False) + output.append(record) + keep.extend(record['bams']) + keep.extend(record['peaks']) + print2(jdumpf('./filereport.json', output)) + print2(jdumpf('./file_shortreport.json', list(map(lambda o: {k: sorted(o[k].keys()) for k in o}, output)))) + print2('size', sum(flistsize(keep).values())) +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/encode-wrapper/encode_test_tasks_run.sh b/encode-wrapper/encode_test_tasks_run.sh index 0f03891..e8e3b46 100755 --- a/encode-wrapper/encode_test_tasks_run.sh +++ b/encode-wrapper/encode_test_tasks_run.sh @@ -12,13 +12,9 @@ mkdir -p $testsOut || true cd $BASE/chip-seq-pipeline2/test/test_task echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" -for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do -#for t in test_bam2ta; do +for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do echo "# started: $t $(date)" $H/testrun_tasks.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND echo "# end: $t $(date) $?" echo "ok___________________" done - - - diff --git a/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh b/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh deleted file mode 100755 index 0a883e8..0000000 --- a/encode-wrapper/encode_test_tasks_run_ihec_slurm_singularity.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - - -BASE=$1 -BACKEND=$2 -tag=${3:-""} -H=$BASE - -chmod +x $H/testrun_tasks_ihec_slurm_singularity.sh -testsOut=$H/test_tasks_results_"$tag" -mkdir -p $testsOut || true -cd $BASE/chip-seq-pipeline2/test/test_task -echo "__container__:$BASE,$BACKEND,$PWD $(which python) $(which java) $PATH $PYTHONPATH" - -for t in test_bam2ta test_bwa test_choose_ctl test_filter test_idr test_macs2 test_merge_fastq test_overlap test_pool_ta test_reproducibility test_spp test_spr test_trim_fastq test_xcor; do -#for t in test_bam2ta; do - echo "# started: $t $(date)" - $H/testrun_tasks_ihec_slurm_singularity.sh $PWD/$t.wdl $PWD/$t.json $testsOut/$t.test_task_output.json $BACKEND - echo "# end: $t $(date) $?" 
- echo "ok___________________" -done - - - diff --git a/encode-wrapper/expected_md5s_h3k27me3.json b/encode-wrapper/expected_md5s_h3k27me3.json index 5b189bc..9d95679 100644 --- a/encode-wrapper/expected_md5s_h3k27me3.json +++ b/encode-wrapper/expected_md5s_h3k27me3.json @@ -1,21 +1,21 @@ { "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "018ad8f5f3158534320ed359563878d3" + "bcb0870d7fb36b19aa6e5bcfb15d5074" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "defd886ab7923b952e04ee033a722fac" + "a40596e82a52219105683fe6b2296414" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "b1ae4fb3f2b68b3c8346c57fa04f476f" + "8ebf1f3415468180c06f2845eb8b7ea1" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "0f38658b68706ec12b5faded1141750e" + "c2647c48949a708b170ce43a16542974" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "b1ac6ab70d053b546f186080639252ed" + "1c8896e5a37dd0ffd85819c158ac2473" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "1c9554fe8b67e61fd7c69a1881ec2e3a" + "83330d3dbd5b984e363bbb1f91c7af20" ], "ChIP-Seq.IX1239-A26688-GGCTAC.134224.D2B0LACXX.2.1.merged.nodup_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ "55de2037c6657d1027fb6b625822fa8b" @@ -27,24 +27,24 @@ "7a52f55148b47e2a48fac330e3672c96" ], "conservative_peak.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "conservative_peak.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + "212aa0cc0a0a732280e59026ba8fda69" ], "optimal_peak.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "optimal_peak.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + "212aa0cc0a0a732280e59026ba8fda69" ], "rep1-pr.overlap.bfilt.narrowPeak.gz": [ - "49fdef6c06796ab06e8ac2a1b88075d1" + "63c4167fc1a01635967d2b35391966b5" ], "rep1-pr.overlap.bfilt.narrowPeak.hammock.gz": [ - "b78724bb667cc7bbfece8a587c10c915" + "212aa0cc0a0a732280e59026ba8fda69" ], "rep1-pr.overlap.narrowPeak.gz": [ - "a896c1ec4693ddbd2e098ffa901c1f2a" + "4be7f27238a6f2818d052546ea7508ed" ] } \ No newline at end of file diff --git a/encode-wrapper/expected_md5s_h3k4me3.json b/encode-wrapper/expected_md5s_h3k4me3.json index 472cb8f..0354405 100644 --- a/encode-wrapper/expected_md5s_h3k4me3.json +++ b/encode-wrapper/expected_md5s_h3k4me3.json @@ -1,21 +1,21 @@ { "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "11df65c690dbde63772231167cc3e3c6" + "2b5b6d36b1ffc7312e975995665454a5" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "67d499fe7a0f5442f0c2fe943599b6bd" + "ff7c420e651a8eaadc5d1728ed7748cf" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr1_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "ce55328fad51c032eb6e532ebcc5a7ee" + "d160f97b59e457cef66f4646350287a1" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ - "76f750f47ac517d611c508dfd7de30e9" + 
"b1bf7417e9482329c7aa0052bc4cdc14" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.hammock.gz": [ - "157ece5658aa3e84aa036ce59c54d839" + "1754bfa54f29838274e3d89070a91a0c" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup.pr2_x_ctl_for_rep1.pval0.01.500K.narrowPeak.gz": [ - "12d46443a1470ec7fe4ea2b308aadde0" + "22e3b8448788c81e80cff2224a0516fa" ], "ChIP-Seq.IX1239-A28471-ATCACG.134224.D2B0LACXX.2.1.merged.nodup_x_ctl_for_rep1.pval0.01.500K.bfilt.narrowPeak.gz": [ "0b624f273c1114314fd5c484580f28f8" @@ -27,24 +27,24 @@ "c9661487f0a63f3a59dc76c91bb58550" ], "conservative_peak.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "conservative_peak.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "optimal_peak.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "optimal_peak.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "rep1-pr.overlap.bfilt.narrowPeak.gz": [ - "0f96fc19c232e8c501c7dcb4ca9ae8bc" + "0cd1911a2533ec29c269f23a667395a4" ], "rep1-pr.overlap.bfilt.narrowPeak.hammock.gz": [ - "97d194f5d92a3a1c1173795bbfd97548" + "38f01b1b3e28a589ad7af5b00ac49cba" ], "rep1-pr.overlap.narrowPeak.gz": [ - "8171d3af09f6ba3e69e995155ba8658f" + "fd19d314a10c6b41e0920332df678c7a" ] } \ No newline at end of file diff --git a/encode-wrapper/ihec_standard_workflow.md b/encode-wrapper/ihec_standard_workflow.md index c72e0a5..6c51557 100644 --- a/encode-wrapper/ihec_standard_workflow.md +++ b/encode-wrapper/ihec_standard_workflow.md @@ -1,4 +1,4 @@ -# IHEC ChIP-Seq standard workdlows +# IHEC ChIP-Seq standard workdfows See the ENCODE reference for input format: https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/docs/input.md diff --git a/encode-wrapper/readme.md b/encode-wrapper/readme.md index 8821f4b..abe4af0 100644 --- a/encode-wrapper/readme.md +++ b/encode-wrapper/readme.md @@ -18,9 +18,9 @@ By default it will use git over http. If you want to use ssh, then pass `ssh` as Run `python chip.py -get` to get IHEC ChIP test data for MCF10A cell line. -## Running on cluster +## Running on cluster (Compute Canada included) -For running on cluster with a slurm etc see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) +For running on cluster with scheduler like SLURM or PBS, and Compute Canada details see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1). ## Memory requirements @@ -32,7 +32,7 @@ The analysis will generate a `qc.json` file (as well as an html version) along w ## Pulling Singularity image and generating wrapper scripts -These scripts require `python 3.6.8` or higher. It's assmumed that the the underlying OS supports `overlayfs` so paths that do not exist on the singularity can be mounted inside singularity (CentOS7 should work fine). CentOS6 does not have `overlayfs` support. If you need support for OS without `overlayfs` please make an issue. +These scripts require `python 3.6.8` or higher. It's assumed that the the underlying OS supports `overlayfs` so paths that do not exist on the singularity can be mounted inside singularity (CentOS7 should work fine). CentOS6 does not have `overlayfs` support. 
 
 Check singularity version with `singularity --version` to make sure it's at least `3.0.1`.
 
@@ -46,18 +46,26 @@ This command will write:
 
 * piperunner.sh
 
-* piperunner_ihec_slurm_singularity.sh
-
 * testrun_tasks.sh
 
-* testrun_tasks_ihec_slurm_singularity.sh
-
 * singularity_encode_test_tasks.sh
 
 * singularity_wrapper.sh
 
 * trackoutput.sh
 
+If you are running in `Local` mode, `./chip.py -pullimage -bindpwd $PWD/data_b $PWD/data_a` will mount `$PWD/data_b` as `/mnt/ext_0`, `$PWD/data_a` as `/mnt/ext_1` and so on, and it binds `$PWD` to `$PWD`. If you are on older systems without support for overlayFS, then passing `-pwd2ext0` will bind `$PWD` to `/mnt/ext_0` and shift the other bind points further along the `ext_$i` mounts.
+
+For example,
+
+    python ./chip.py -pullimage -bindpwd -pwd2ext0 $PWD/v2/ihec
+
+will set up all binds so that after downloading the cemt0007 test data, you can just use `cemt0007_h3k27me3_mnt_ext_0.json` out of the box like:
+
+    $ ./singularity_wrapper.sh cemt0007_h3k27me3_mnt_ext_0.json
+
+without needing to do `chip.py -maketests` as described later.
+
 This will also create the singularity image in `./images`.
 
 Do `chmod +x ./*sh`.
 
@@ -72,11 +80,19 @@ For example `python chip.py -pullimage -bindpwd -nobuild $PWD/v2/ihec/test_data/
 
 ## Running tests
 
+
+
+
+
+
+
 ### ENCODE tests
 
-To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see ENCODE pipeline documentation). The output of tests will be written in `test_tasks_results_try1`. If you are on HPC and prefer to use SLURM, do `./encode_test_tasks_run_ihec_slurm_singularity.sh slurm_singularity try1`.
+To run ENCODE test tasks, do `./singularity_encode_test_tasks.sh try1` to run it locally. The first argument is the config argument to cromwell (see the ENCODE pipeline documentation). The output of the tests will be written in `test_tasks_results_try1`. For testing on Compute Canada see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1).
 
-You will need atleast 10G of memory for running the encode tasks.
+You will need at least 10G of memory for running the ENCODE test tasks.
 
 Make sure all tests pass by looking through the generated JSONs. `./status_encode_tasks.py` can be used here.
 
@@ -113,17 +129,19 @@ IHEC tests on Local mode can be run with:
 
 `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` and `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json`
 
-You can also use SLURM with with the pipeline; please see [cluster](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1) section. It's recommended that `singularity_wrapper.sh` is used instead for simplicity.
+
 
-`./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k4me3.json slurm_singularity h3k4me3_out` and `./piperunner_ihec_slurm_singularity.sh ./v2/ihec/cemt0007_h3k27me3.json slurm_singularity h3k27me3_out`
+For testing on Compute Canada see [this section](https://github.com/IHEC/integrative_analysis_chip/blob/dev-organize-output/encode-wrapper/readme.md#running-on-cluster-1).
 
 The provided configuration files are for 75bp PET only. Standard configuration files for SET and other read lengths will be provided. The ENCODE documentation discusses other modes. For these tests, the running time can be 24 hours depending on hardware.
 
-To compute md5s of generated file, use `computemd5s.py <outdir> <tag>` with `<outdir>` being the output directory of previous step and `<tag>` being the suffix to add at file output basename `computemd5s_<tag>`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without the bam header as that may contain full paths names.
+To compute md5s of generated files, use `computemd5s.py <outdir> <tag>` with `<outdir>` being the output directory of the previous step and `<tag>` being the suffix to add to the file output basename `computemd5s_<tag>`. This will locate peak calls and bam files, and generate scripts to compute the md5s. Note the bam md5s are generated without the bam header as that may contain full path names.
 
-As an example, supose output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `outdir=$PWD/cromwell-executions/chip/93de85aa-d581-48df-b8ae-a91a6e88a21f`. So do
+As an example, suppose the output of `./singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json` is in `outdir=$PWD/cromwell-executions/chip/93de85aa-d581-48df-b8ae-a91a6e88a21f`. Then do
 
     python computemd5s.py $outdir test # the first must be the cromwell directory for the analysis, the second a suffix for the script
     chmod +x ./computemd5s_test
 
@@ -163,14 +181,28 @@ See output of `./trackoutput.sh -outdir:$outdi
 
     ./unresolvedfiles.list # files that will be kept, but cannot be accessed as they may be hardlinks that cannot be resolved
    ./unexpectedfiles.list # extraneous cromwell files that do not match patterns for expected cromwell files
 
-Cromwell generates large number of files by creating hardlinks; this script attempts to resolve these links and keeping only one copy of each. `./unresolvedfiles.list` contains hardlinks that the script is unable to resolve because of mount issues or othet OS errors.
+Cromwell generates a large number of files by creating hardlinks; this script attempts to resolve these links and keep only one copy of each. `./unresolvedfiles.list` contains hardlinks that the script is unable to resolve because of mount issues or other OS errors.
 
 It's expected that `unresolvedfiles.list` and `unexpectedfiles.list` are empty. If they are not empty, the files listed there will need to be looked at. Please review files before deleting to ensure nothing useful is removed.
 
-The recommended workflow is to consider removing files from `delete.list` only (in case diskspace is an issue). And then symlink files from masterfiles.list (while keeping everything else) to a final analysis directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra logs files and scripts.
+The recommended workflow is to consider removing files from `delete.list` only (in case disk space is an issue), and then symlink files from masterfiles.list (while keeping everything else) to a final analysis directory. So all files other than input files and intermediate bam files are still available inside the cromwell directory but the output directory is organized and free of extra log files and scripts.
 
 ## Running on cluster
 
-While the slurm_backend as defined by the encode pipeline will/should work; however, it's recommended that to run analysis on the cluster using slurm (or alternatives) just submit the a shell script containing the `./singularity_wrapper.sh $config` command. This means the entire job will run inside the container on one node on the cluster (i.e. the job will run in Local mode on the node it's submitted to). Using `slurm_singularity` backends (see [ENCODE documentation](https://encode-dcc.github.io/wdl-pipelines/install.html)) will mean cromwell will run on the head node (or where ever the job was launched from), and it will manage farming out each individual task to the cluster, with each task run in its own instance of singularity.
+While the slurm_backend as defined by the ENCODE pipeline should work, it's recommended that, to run analysis on the cluster using SLURM (or alternatives), you just submit a shell script containing the `./singularity_wrapper.sh $config` command. This means the entire job will run inside the container on one node of the cluster (i.e. the job will run in Local mode on the node it's submitted to). Using `slurm_singularity` backends (see [ENCODE documentation](https://encode-dcc.github.io/wdl-pipelines/install.html)) will mean cromwell will run on the head node (or wherever the job was launched from), and it will manage farming out each individual task to the cluster, with each task run in its own instance of singularity.
+
+### Compute Canada
+
+If you are a Compute Canada user you can customize resources for the different steps by using the file computecanada_resources.json.
+To merge the resources.json and the input.json: `jq -s '.[0] * .[1]' input.json computecanada_resources.json > output_merged.json` (see the worked example after this section).
+
+To set up the pipeline you need to do the following:
+- Load singularity with `module load singularity/3.7` and set up the default folder for pulling the image: `mkdir -p /localscratch/$USER ; export SINGULARITY_TMPDIR=/localscratch/$USER`
+- Add `export MUGQIC_INSTALL_HOME=/cvmfs/soft.mugqic/CentOS6` and `module use $MUGQIC_INSTALL_HOME/modulefiles` to your `.bashrc`.
+- Pull all the resources needed for running the tests: `./get_encode_resources.sh && python chip.py -get -pullimage -bindpwd -maketests && chmod +x *.sh`
+- Use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh`. Usage: `./computecanada_wrapper.sh singularity_wrapper.sh input.json output_dir` with the output_dir behaving the same as for `singularity_wrapper.sh`. This wrapper script is designed to use 20 CPUs and 4700M of RAM per CPU (half a full node on Beluga); it can be customized to fit the user's needs.
+
+To do ENCODE testing, run `./computecanada_encode_test_tasks.sh singularity_encode_test_tasks.sh try1` instead of `./singularity_encode_test_tasks.sh try1` and then follow the standard procedure for checking results (see the md5 comparison sketch appended at the end).
+To do MCF10A testing, use `computecanada_wrapper.sh` instead of `singularity_wrapper.sh` as follows: `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k4me3.json h3k4me3_out` and `./computecanada_wrapper.sh singularity_wrapper.sh ./v2/ihec/cemt0007_h3k27me3.json h3k27me3_out`, then follow the standard procedure for checking results (the checking scripts need sambamba; you can do `module load mugqic/sambamba` to have it available in your environment).
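+
+For illustration only (this tiny `input.json` is a made-up stand-in for a real pipeline input): `jq -s` slurps both files into a single array, and `.[0] * .[1]` merges the two objects recursively with keys from the second file winning on conflict, so the Compute Canada resource settings override anything the input already defines:
+
+    $ echo '{"chip.title": "test", "chip.spp_cpu": 1}' > input.json
+    $ jq -s '.[0] * .[1]' input.json computecanada_resources.json > output_merged.json
+    $ grep spp_cpu output_merged.json
+      "chip.spp_cpu": 2,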
diff --git a/encode-wrapper/status_encode_tasks.py b/encode-wrapper/status_encode_tasks.py
index 681ae94..379d6cc 100644
--- a/encode-wrapper/status_encode_tasks.py
+++ b/encode-wrapper/status_encode_tasks.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 from utilsm import *
 import sys
 import glob
diff --git a/encode-wrapper/testrun_template.sh b/encode-wrapper/testrun_template.sh
index 1339aa9..619fe41 100755
--- a/encode-wrapper/testrun_template.sh
+++ b/encode-wrapper/testrun_template.sh
@@ -2,7 +2,7 @@
 jobFile=$1
 BACKEND=$2
-if [[ $# -eq 3 ]]; then 
+if [[ $# -eq 3 ]]; then
     OUTDIR="-Dbackend.providers.$BACKEND.config.root=$3"
 else
     OUTDIR=""
@@ -13,5 +13,5 @@ BACKEND_CONF="{backend}"
 WORKFLOW_OPT="{container}"
 CHIP="{wdl}"
 
-java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $OUTDIR cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT
+java -jar -Dconfig.file=$BACKEND_CONF -Dbackend.default=$BACKEND $OUTDIR $CROMWELL_HOME/cromwell-34.jar run $CHIP -i $jobFile -o $WORKFLOW_OPT
 echo "return:$?"
diff --git a/encode-wrapper/utilsm.py b/encode-wrapper/utilsm.py
index de4675a..d7208a7 100644
--- a/encode-wrapper/utilsm.py
+++ b/encode-wrapper/utilsm.py
@@ -77,7 +77,3 @@ def by_keyvalue(alist, k, v):
             hashed[ke] = list()
         hashed[ke].append(ve)
     return hashed
-
-
-
-
diff --git a/encode-wrapper/utilsm.pyc b/encode-wrapper/utilsm.pyc
new file mode 100644
index 0000000..36714a2
Binary files /dev/null and b/encode-wrapper/utilsm.pyc differ
diff --git a/readme.md b/readme.md
index c6838f6..f818d78 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,9 @@
 # IHEC wrapper for ENCODE ChIP-Seq pipeline.
 
+This is the IHEC wrapper around the ENCODE ChIP-Seq pipeline; it captures the IHEC-specific configurations for analyzing ChIP-Seq data through the ENCODE pipeline.
+
+For an introduction to the ENCODE ChIP-Seq pipeline see: [ENCODE-DCC/chip-seq-pipeline2](https://github.com/ENCODE-DCC/chip-seq-pipeline2)
+
 Please see the documentation available [here](encode-wrapper/readme.md) to set up and test the pipeline.
 
 Also refer to IHEC standard [workflows](ihec_standard_workflow.md)
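
As a sketch of the "standard procedure for checking results" referenced above: the helper below is illustrative only (its name `check_md5s.py` and the comparison loop are not part of this patch); it assumes the `<basename> <md5> <path>` line format emitted by a generated `computemd5s_<tag>` script and the `expected_md5s_*.json` layout of basename mapped to a list of acceptable md5s:

    #!/usr/bin/env python3
    # check_md5s.py -- hypothetical helper, not shipped in this repository.
    # Usage: python3 check_md5s.py observed_md5s.txt expected_md5s_h3k4me3.json
    import json
    import sys

    def main(observed_file, expected_file):
        expected = json.load(open(expected_file))
        failures = 0
        for line in open(observed_file):
            parts = line.split()
            if len(parts) < 2:
                continue  # skip blank or malformed lines
            name, md5 = parts[0], parts[1]
            if name not in expected:
                continue  # file not covered by the expected set
            if md5 in expected[name]:
                print('ok      ', name)
            else:
                failures += 1
                print('MISMATCH', name, md5, 'expected one of', expected[name])
        sys.exit(1 if failures else 0)

    if __name__ == '__main__':
        main(sys.argv[1], sys.argv[2])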