Skip to content

Commit 6921fd6

Browse files
authored
Merge pull request #228 from ENCODE-DCC/dev
v1.8.1
2 parents b4ffdfb + 241a1b9 commit 6921fd6

17 files changed

+637
-415
lines changed

.circleci/config.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,4 +285,3 @@ workflows:
285285
- test_workflow_true_rep_only_pbam_pe:
286286
requires:
287287
- build
288-

chip.wdl

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
version 1.0
22

33
workflow chip {
4-
String pipeline_ver = 'v1.8.0'
4+
String pipeline_ver = 'v1.8.1'
55

66
meta {
7-
version: 'v1.8.0'
7+
version: 'v1.8.1'
88
author: 'Jin wook Lee (leepc12@gmail.com) at ENCODE-DCC'
99
description: 'ENCODE TF/Histone ChIP-Seq pipeline'
1010
specification_document: 'https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit?usp=sharing'
1111

12-
caper_docker: 'encodedcc/chip-seq-pipeline:v1.8.0'
13-
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.8.0'
12+
caper_docker: 'encodedcc/chip-seq-pipeline:v1.8.1'
13+
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.8.1'
1414
croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.v5.json'
1515

1616
parameter_group: {
@@ -2463,20 +2463,22 @@ task count_signal_track {
24632463
File? ta # tag-align
24642464
File chrsz # 2-col chromosome sizes file
24652465
}
2466+
Float mem_gb = 8.0
24662467

24672468
command {
24682469
set -e
24692470
python3 $(which encode_task_count_signal_track.py) \
24702471
${ta} \
2471-
${'--chrsz ' + chrsz}
2472+
${'--chrsz ' + chrsz} \
2473+
${'--mem-gb ' + mem_gb}
24722474
}
24732475
output {
24742476
File pos_bw = glob('*.positive.bigwig')[0]
24752477
File neg_bw = glob('*.negative.bigwig')[0]
24762478
}
24772479
runtime {
24782480
cpu : 1
2479-
memory : '8 GB'
2481+
memory : '${mem_gb} GB'
24802482
time : 4
24812483
disks : 'local-disk 50 SSD'
24822484
}
@@ -2547,7 +2549,8 @@ task call_peak {
25472549
${'--chrsz ' + chrsz} \
25482550
${'--fraglen ' + fraglen} \
25492551
${'--cap-num-peak ' + cap_num_peak} \
2550-
${'--pval-thresh '+ pval_thresh}
2552+
${'--pval-thresh '+ pval_thresh} \
2553+
${'--mem-gb ' + mem_gb}
25512554
25522555
elif [ '${peak_caller}' == 'spp' ]; then
25532556
python3 $(which encode_task_spp.py) \
@@ -2614,7 +2617,8 @@ task macs2_signal_track {
26142617
${'--gensz '+ gensz} \
26152618
${'--chrsz ' + chrsz} \
26162619
${'--fraglen ' + fraglen} \
2617-
${'--pval-thresh '+ pval_thresh}
2620+
${'--pval-thresh '+ pval_thresh} \
2621+
${'--mem-gb ' + mem_gb}
26182622
}
26192623
output {
26202624
File pval_bw = glob('*.pval.signal.bigwig')[0]

dev/build_on_dx_dockerhub.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ set -e
33

44
VER=$(cat chip.wdl | grep "String pipeline_ver = " | awk '{gsub("'"'"'",""); print $4}')
55
DOCKER=encodedcc/chip-seq-pipeline:$VER
6-
DXWDL=~/dxWDL-v1.46.4.jar
6+
DXWDL=~/dxWDL-v1.50.jar
77

88
# check if docker image exists on dockerhub
99
docker pull $DOCKER

src/encode_lib_common.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,34 @@ def copy_f_to_dir(f, out_dir): # copy 'f' to 'out_dir'/'f'
280280
return copy_f_to_f(f, dest)
281281

282282

283+
def get_gnu_sort_param(max_mem_job, ratio=0.5):
284+
"""Get a string of parameters for GNU sort according to maximum memory of a job/instance.
285+
286+
For GNU `sort`, `-S` or `--buffer-size` defines the buffer size for the sorting,
287+
which defaults to max(available_mem, 1/8 * total_mem) of a node/instance.
288+
289+
sort -S [SIZE][UNIT] ...
290+
291+
See the following link for details.
292+
https://github.com/coreutils/coreutils/blob/master/src/sort.c#L1492
293+
294+
This can be a problem if a job is assigned with a limited amount of memory,
295+
but the job runs on a large node (e.g. 256GB of memory).
296+
297+
`-S` defines an INITIAL buffer size and it will automatically grow
298+
if more memory is needed by `sort`.
299+
300+
301+
Args:
302+
max_mem_job:
303+
Maximum amount of memory for a job/instance in bytes.
304+
ratio:
305+
Ratio to define the buffer size according to `max_mem_job`.
306+
"""
307+
mem_mb = int(math.ceil(max_mem_job * ratio / (1024 * 1024)))
308+
return '-S {mem_mb}M'.format(mem_mb=mem_mb)
309+
310+
283311
def now():
284312
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
285313

src/encode_lib_genomic.py

Lines changed: 73 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import subprocess
1111

1212
from encode_lib_common import (
13+
get_gnu_sort_param,
1314
get_num_lines,
1415
get_peak_type,
1516
human_readable_number,
@@ -324,7 +325,7 @@ def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir):
324325
# convert encode peak file to hammock (for Wash U browser track)
325326

326327

327-
def peak_to_hammock(peak, out_dir):
328+
def peak_to_hammock(peak, mem_gb, out_dir):
328329
peak_type = get_peak_type(peak)
329330
prefix = os.path.join(out_dir, os.path.basename(
330331
strip_ext_peak(peak)))
@@ -335,14 +336,26 @@ def peak_to_hammock(peak, out_dir):
335336
hammock_gz_tbi = '{}.gz.tbi'.format(hammock)
336337

337338
if get_num_lines(peak) == 0:
338-
cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
339-
run_shell_cmd(cmd)
340-
cmd2 = 'touch {}'.format(hammock_gz_tbi)
339+
run_shell_cmd(
340+
'zcat -f {peak} | gzip -nc > {hammock_gz}'.format(
341+
peak=peak,
342+
hammock_gz=hammock_gz,
343+
)
344+
)
345+
run_shell_cmd(
346+
'touch {hammock_gz_tbi}'.format(
347+
hammock_gz_tbi=hammock_gz_tbi,
348+
)
349+
)
341350
else:
342-
cmd = "zcat -f {} | "
343-
cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
344-
cmd = cmd.format(peak, hammock_tmp)
345-
run_shell_cmd(cmd)
351+
run_shell_cmd(
352+
'zcat -f {peak} | '
353+
'LC_COLLATE=C sort -k1,1V -k2,2n {sort_param} > {hammock_tmp}'.format(
354+
peak=peak,
355+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
356+
hammock_tmp=hammock_tmp,
357+
)
358+
)
346359

347360
with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
348361
id = 1
@@ -390,13 +403,22 @@ def peak_to_hammock(peak, out_dir):
390403

391404
fout.write('\n')
392405

393-
cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
394-
cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
395-
run_shell_cmd(cmd2)
396-
cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
397-
run_shell_cmd(cmd3)
406+
run_shell_cmd(
407+
'zcat -f {hammock_tmp2} | sort -k1,1 -k2,2n {sort_param} | '
408+
'bgzip -cf > {hammock_gz}'.format(
409+
hammock_tmp2=hammock_tmp2,
410+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
411+
hammock_gz=hammock_gz,
412+
)
413+
)
414+
run_shell_cmd(
415+
'tabix -f -p bed {hammock_gz}'.format(
416+
hammock_gz=hammock_gz,
417+
)
418+
)
398419

399420
rm_f([hammock, hammock_tmp, hammock_tmp2])
421+
400422
return (hammock_gz, hammock_gz_tbi)
401423

402424

@@ -435,7 +457,7 @@ def starch_to_bed_gz(starch, out_dir):
435457
return bed_gz
436458

437459

438-
def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
460+
def peak_to_bigbed(peak, peak_type, chrsz, mem_gb, out_dir):
439461
prefix = os.path.join(out_dir,
440462
os.path.basename(strip_ext(peak)))
441463
bigbed = '{}.{}.bb'.format(prefix, peak_type)
@@ -460,7 +482,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
460482
int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
461483
)
462484
'''
463-
bed_param = '-type=bed6+4 -as={}'.format(as_file)
485+
bed_param = '-type=bed6+4 -as={as_file}'.format(as_file=as_file)
464486
elif peak_type.lower() == 'broadpeak':
465487
as_file_contents = '''table broadPeak
466488
"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
@@ -476,7 +498,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
476498
float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
477499
)
478500
'''
479-
bed_param = '-type=bed6+3 -as={}'.format(as_file)
501+
bed_param = '-type=bed6+3 -as={as_file}'.format(as_file=as_file)
480502
elif peak_type.lower() == 'gappedpeak':
481503
as_file_contents = '''table gappedPeak
482504
"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
@@ -498,26 +520,47 @@ def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
498520
float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
499521
)
500522
'''
501-
bed_param = '-type=bed12+3 -as={}'.format(as_file)
523+
bed_param = '-type=bed12+3 -as={as_file}'.format(as_file=as_file)
502524
else:
503-
raise Exception('Unsupported peak file type {}!'.format(peak_type))
525+
raise Exception('Unsupported peak file type {peak_type}!'.format(peak_type=peak_type))
504526

505527
# create temporary .as file
506528
with open(as_file, 'w') as fp:
507529
fp.write(as_file_contents)
508530

509-
cmd1 = "cat {} > {}".format(chrsz, chrsz_tmp)
510-
run_shell_cmd(cmd1)
511-
cmd2 = "zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | "
512-
cmd2 += 'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
513-
cmd2 += 'if ($5<0) $5=0; print $0}}\' > {}'
514-
cmd2 = cmd2.format(peak, bigbed_tmp)
515-
run_shell_cmd(cmd2)
516-
cmd3 = "bedClip {} {} {}".format(bigbed_tmp, chrsz_tmp, bigbed_tmp2)
517-
run_shell_cmd(cmd3)
518-
cmd4 = "bedToBigBed {} {} {} {}".format(
519-
bed_param, bigbed_tmp2, chrsz_tmp, bigbed)
520-
run_shell_cmd(cmd4)
531+
run_shell_cmd(
532+
'cat {chrsz} > {chrsz_tmp}'.format(
533+
chrsz=chrsz,
534+
chrsz_tmp=chrsz_tmp,
535+
)
536+
)
537+
538+
run_shell_cmd(
539+
'zcat -f {peak} | LC_COLLATE=C sort -k1,1 -k2,2n {sort_param} | '
540+
'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
541+
'if ($5<0) $5=0; print $0}}\' > {bigbed_tmp}'.format(
542+
peak=peak,
543+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
544+
bigbed_tmp=bigbed_tmp,
545+
)
546+
)
547+
548+
run_shell_cmd(
549+
'bedClip {bigbed_tmp} {chrsz_tmp} {bigbed_tmp2}'.format(
550+
bigbed_tmp=bigbed_tmp,
551+
chrsz_tmp=chrsz_tmp,
552+
bigbed_tmp2=bigbed_tmp2,
553+
)
554+
)
555+
556+
run_shell_cmd(
557+
'bedToBigBed {bed_param} {bigbed_tmp2} {chrsz_tmp} {bigbed}'.format(
558+
bed_param=bed_param,
559+
bigbed_tmp2=bigbed_tmp2,
560+
chrsz_tmp=chrsz_tmp,
561+
bigbed=bigbed,
562+
)
563+
)
521564

522565
# remove temporary files
523566
rm_f([as_file, chrsz_tmp, bigbed_tmp, bigbed_tmp2])

src/encode_task_bwa.py

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
import argparse
1010
from encode_lib_common import (
1111
get_num_lines, log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_fastq,
12-
strip_ext_tar, untar)
12+
strip_ext_tar, untar,
13+
get_gnu_sort_param,
14+
)
1315
from encode_lib_genomic import (
1416
get_read_length, samtools_sort, bam_is_empty, get_samtools_res_param)
1517

@@ -43,7 +45,8 @@ def parse_arguments():
4345
parser.add_argument('--nth', type=int, default=1,
4446
help='Number of threads to parallelize.')
4547
parser.add_argument('--mem-gb', type=float,
46-
help='Max. memory for samtools sort in GB. '
48+
help='Max. memory for samtools sort and GNU sort -S '
49+
'(half of this value will be used for GNU sort) in GB. '
4750
'It should be total memory for this task (not memory per thread).')
4851
parser.add_argument('--out-dir', default='', type=str,
4952
help='Output directory.')
@@ -147,27 +150,30 @@ def bwa_pe(fastq1, fastq2, ref_index_prefix, nth, mem_gb, use_bwa_mem_for_pe,
147150
sai1 = bwa_aln(fastq1, ref_index_prefix, nth, out_dir)
148151
sai2 = bwa_aln(fastq2, ref_index_prefix, nth, out_dir)
149152

150-
cmd = 'bwa sampe {} {} {} {} {} | gzip -nc > {}'.format(
151-
ref_index_prefix,
152-
sai1,
153-
sai2,
154-
fastq1,
155-
fastq2,
156-
sam)
153+
cmd = 'bwa sampe {ref_index_prefix} {sai1} {sai2} {fastq1} {fastq2} | gzip -nc > {sam}'.format(
154+
ref_index_prefix=ref_index_prefix,
155+
sai1=sai1,
156+
sai2=sai2,
157+
fastq1=fastq1,
158+
fastq2=fastq2,
159+
sam=sam,
160+
)
157161
temp_files.extend([sai1, sai2, sam])
158162
run_shell_cmd(cmd)
159163

160-
cmd2 = 'zcat -f {} | '
161-
cmd2 += 'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
162-
cmd2 += '{{ cigar=$6; gsub("[0-9]+D","",cigar); '
163-
cmd2 += 'n = split(cigar,vals,"[A-Z]"); s = 0; '
164-
cmd2 += 'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
165-
cmd2 += 'if (s!=seqlen) print $1"\\t"; }}\' | '
166-
cmd2 += 'sort | uniq > {}'
167-
cmd2 = cmd2.format(
168-
sam,
169-
badcigar)
170-
run_shell_cmd(cmd2)
164+
run_shell_cmd(
165+
'zcat -f {sam} | '
166+
'awk \'BEGIN {{FS="\\t" ; OFS="\\t"}} ! /^@/ && $6!="*" '
167+
'{{ cigar=$6; gsub("[0-9]+D","",cigar); '
168+
'n = split(cigar,vals,"[A-Z]"); s = 0; '
169+
'for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10); '
170+
'if (s!=seqlen) print $1"\\t"; }}\' | '
171+
'sort {sort_param} | uniq > {badcigar}'.format(
172+
sam=sam,
173+
sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
174+
badcigar=badcigar,
175+
)
176+
)
171177

172178
# Remove bad CIGAR read pairs
173179
if get_num_lines(badcigar) > 0:

0 commit comments

Comments (0)