Commit 2d51d34

Merge pull request #214 from ENCODE-DCC/dev
v1.7.1
2 parents 72360b4 + c80c672 commit 2d51d34

File tree

6 files changed (+58, -29 lines)


README.md

Lines changed: 10 additions & 0 deletions

````diff
@@ -5,6 +5,11 @@
 
 ## Important notice for Conda users
 
+If it takes too long to resolve Conda package conflicts while installing the pipeline's Conda environment, then try with `mamba` instead. Add `mamba` to the install command line.
+```bash
+$ scripts/install_conda_env.sh mamba
+```
+
 For every new pipeline release, Conda users always need to update the pipeline's Conda environment (`encode-chip-seq-pipeline`), even if they don't use newly added features.
 ```bash
 $ cd chip-seq-pipeline2
@@ -83,6 +88,11 @@ An input JSON file specifies all the input parameters and files that are necessa
 1) [Input JSON file specification (short)](docs/input_short.md)
 2) [Input JSON file specification (long)](docs/input.md)
 
+## Running and sharing on Truwl
+You can run this pipeline on [truwl.com](https://truwl.com/). This provides a web interface that allows you to define inputs and parameters, run the job on GCP, and monitor progress. To run it you will need to create an account on the platform, then request early access by emailing [info@truwl.com](mailto:info@truwl.com) to get the right permissions. You can see the example cases from this repo at [https://truwl.com/workflows/instance/WF_dd6938.8f.340f/command](https://truwl.com/workflows/instance/WF_dd6938.8f.340f/command) and [https://truwl.com/workflows/instance/WF_dd6938.8f.8aa3/command](https://truwl.com/workflows/instance/WF_dd6938.8f.8aa3/command). The example jobs (or other jobs) can be forked to pre-populate the inputs for your own job.
+
+If you do not run the pipeline on Truwl, you can still share your use case/job on the platform by getting in touch at [info@truwl.com](mailto:info@truwl.com) and providing your inputs.json file.
+
 ## Running a pipeline on DNAnexus
 
 You can also run this pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL.
````
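The notice above makes `mamba` an opt-in, explicit argument to the install script. A minimal sketch of the alternative design, auto-detecting `mamba` and falling back to `conda` (hypothetical wrapper, not the pipeline's actual script, which requires the explicit `mamba` argument):

```shell
# Hypothetical wrapper: prefer mamba when it is on PATH, else fall back to conda.
# The real scripts/install_conda_env.sh instead takes "mamba" as an argument.
if command -v mamba >/dev/null 2>&1; then
    SOLVER=mamba
else
    SOLVER=conda
fi
echo "solver: ${SOLVER}"
```

Requiring an explicit argument, as the pipeline does, keeps behavior predictable on machines where a stale `mamba` happens to be on `PATH`.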

chip.wdl

Lines changed: 14 additions & 13 deletions

```diff
@@ -1,15 +1,16 @@
 version 1.0
 
 workflow chip {
-    String pipeline_ver = 'v1.7.0'
+    String pipeline_ver = 'v1.7.1'
 
     meta {
+        version: 'v1.7.1'
         author: 'Jin wook Lee (leepc12@gmail.com) at ENCODE-DCC'
         description: 'ENCODE TF/Histone ChIP-Seq pipeline'
         specification_document: 'https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit?usp=sharing'
 
-        caper_docker: 'encodedcc/chip-seq-pipeline:v1.7.0'
-        caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.7.0'
+        caper_docker: 'encodedcc/chip-seq-pipeline:v1.7.1'
+        caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.7.1'
         croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.v5.json'
 
         parameter_group: {
@@ -183,15 +184,15 @@ workflow chip {
         Int filter_cpu = 4
         Float filter_mem_factor = 0.4
         Int filter_time_hr = 24
-        Float filter_disk_factor = 6.0
+        Float filter_disk_factor = 8.0
 
         Int bam2ta_cpu = 2
         Float bam2ta_mem_factor = 0.35
         Int bam2ta_time_hr = 6
         Float bam2ta_disk_factor = 4.0
 
-        Float spr_mem_factor = 4.5
-        Float spr_disk_factor = 6.0
+        Float spr_mem_factor = 13.5
+        Float spr_disk_factor = 18.0
 
         Int jsd_cpu = 4
         Float jsd_mem_factor = 0.1
@@ -203,19 +204,19 @@ workflow chip {
         Int xcor_time_hr = 24
         Float xcor_disk_factor = 4.5
 
-        Float subsample_ctl_mem_factor = 7.0
-        Float subsample_ctl_disk_factor = 7.5
+        Float subsample_ctl_mem_factor = 14.0
+        Float subsample_ctl_disk_factor = 15.0
 
-        Float macs2_signal_track_mem_factor = 6.0
+        Float macs2_signal_track_mem_factor = 12.0
         Int macs2_signal_track_time_hr = 24
-        Float macs2_signal_track_disk_factor = 40.0
+        Float macs2_signal_track_disk_factor = 80.0
 
         Int call_peak_cpu = 6
         Float call_peak_spp_mem_factor = 5.0
-        Float call_peak_macs2_mem_factor = 2.5
+        Float call_peak_macs2_mem_factor = 5.0
         Int call_peak_time_hr = 72
         Float call_peak_spp_disk_factor = 5.0
-        Float call_peak_macs2_disk_factor = 15.0
+        Float call_peak_macs2_disk_factor = 30.0
 
         String? align_trimmomatic_java_heap
         String? filter_picard_java_heap
@@ -2776,7 +2777,7 @@ task gc_bias {
         cpu : 1
         memory : '${mem_gb} GB'
         time : 6
-        disks : 'local-disk 100 SSD'
+        disks : 'local-disk 150 SSD'
     }
 }
 
```
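The chip.wdl changes above mostly double memory and disk factors. As a back-of-the-envelope sketch of what a disk factor implies (assumption: the pipeline multiplies the factor by the input file size and rounds up to whole GB; the 2.5 GB BAM size here is purely illustrative):

```shell
# Illustrative only: a 2.5 GB BAM with the new filter_disk_factor = 8.0.
bam_size_gb=2.5
factor=8.0
# Multiply and round up to an integer number of GB.
disk_gb=$(awk -v s="$bam_size_gb" -v f="$factor" \
    'BEGIN { d = s * f; if (d != int(d)) d = int(d) + 1; print int(d) }')
echo "disks: local-disk ${disk_gb} SSD"
```

So the same input that previously got a 15 GB request under a 6.0 factor now gets 20 GB under 8.0, which is the point of the bump: fewer out-of-disk task failures on large inputs.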

dev/docker_image/Dockerfile

Lines changed: 3 additions & 2 deletions

```diff
@@ -90,8 +90,9 @@ RUN pip3 install --no-cache-dir SAMstats==0.2.1
 RUN git clone --branch 2.0.4.2 --single-branch https://github.com/kundajelab/idr && \
     cd idr && python3 setup.py install && cd ../ && rm -rf idr*
 
-# Install system/math python packages (python2)
-RUN pip2 install --no-cache-dir numpy matplotlib==2.2.4
+# Install system/math python packages and biopython
+RUN pip2 install --no-cache-dir numpy scipy matplotlib==2.2.4 bx-python==0.8.2 biopython==1.76
+RUN pip3 install --no-cache-dir biopython==1.76
 
 # Install genomic python packages (python2)
 RUN pip2 install --no-cache-dir metaseq==0.5.6
```

docs/input.md

Lines changed: 9 additions & 9 deletions

```diff
@@ -256,7 +256,7 @@ Parameter|Default|Description
 `chip.filter_cpu` | 4 |
 `chip.filter_mem_factor` | 0.4 | Multiplied to size of BAM to determine required memory
 `chip.filter_time_hr` | 24 | Walltime (HPCs only)
-`chip.filter_disk_factor` | 6.0 | Multiplied to size of BAM to determine required disk
+`chip.filter_disk_factor` | 8.0 | Multiplied to size of BAM to determine required disk
 
 Parameter|Default|Description
 ---------|-------|-----------
@@ -267,8 +267,8 @@ Parameter|Default|Description
 
 Parameter|Default|Description
 ---------|-------|-----------
-`chip.spr_mem_factor` | 4.5 | Multiplied to size of filtered BAM to determine required memory
-`chip.spr_disk_factor` | 6.0 | Multiplied to size of filtered BAM to determine required disk
+`chip.spr_mem_factor` | 13.5 | Multiplied to size of filtered BAM to determine required memory
+`chip.spr_disk_factor` | 18.0 | Multiplied to size of filtered BAM to determine required disk
 
 Parameter|Default|Description
 ---------|-------|-----------
@@ -288,22 +288,22 @@ Parameter|Default|Description
 ---------|-------|-----------
 `chip.call_peak_cpu` | 6 | Used for both peak callers (`spp` and `macs2`). `spp` is well multithreaded but `macs2` is single-threaded. More than 2 is not required for `macs2`.
 `chip.call_peak_spp_mem_factor` | 5.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
-`chip.call_peak_macs2_mem_factor` | 2.5 | Multiplied to size of TAG-ALIGN BED to determine required memory
+`chip.call_peak_macs2_mem_factor` | 5.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
 `chip.call_peak_time_hr` | 24 | Walltime (HPCs only)
 `chip.call_peak_spp_disk_factor` | 5.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
-`chip.call_peak_macs2_disk_factor` | 15.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
+`chip.call_peak_macs2_disk_factor` | 30.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
 
 Parameter|Default|Description
 ---------|-------|-----------
-`chip.macs2_signal_track_mem_factor` | 6.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
+`chip.macs2_signal_track_mem_factor` | 12.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
 `chip.macs2_signal_track_time_hr` | 24 | Walltime (HPCs only)
-`chip.macs2_signal_track_disk_factor` | 40.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
+`chip.macs2_signal_track_disk_factor` | 80.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
 
 Parameter|Default|Description
 ---------|-------|-----------
-`chip.subsample_ctl_mem_factor` | 7.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
+`chip.subsample_ctl_mem_factor` | 14.0 | Multiplied to size of TAG-ALIGN BED to determine required memory
 `chip.macs2_signal_track_time_hr` | 24 | Walltime (HPCs only)
-`chip.subsample_ctl_disk_factor` | 7.5 | Multiplied to size of TAG-ALIGN BED to determine required disk
+`chip.subsample_ctl_disk_factor` | 15.0 | Multiplied to size of TAG-ALIGN BED to determine required disk
 
 If your system/cluster does not allow large memory allocation for Java applications, check the following resource parameters to manually define Java memory. It is **NOT RECOMMENDED** for most users to change these parameters, since the pipeline automatically takes 90% of a task's memory for Java apps.
 
```
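The `*_mem_factor` parameters above scale with input size rather than being fixed amounts. A rough sketch of the resulting request (assumption: required memory is approximately factor times file size; the 1.2 GB filtered-BAM size is illustrative):

```shell
# Illustrative: memory implied by the new chip.spr_mem_factor = 13.5
# for a 1.2 GB filtered BAM. The factor scales linearly with input size.
mem=$(awk 'BEGIN { printf "%.1f", 13.5 * 1.2 }')
echo "spr memory: ${mem} GB"
```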
docs/install_conda.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -32,11 +32,11 @@ If you do not have miniconda (or anaconda) installed, follow the instructions be
 
 4) **IMPORTANT**: Close your session and re-login.
 
-5) Install pipeline's Conda environment.
+5) Install the pipeline's Conda environment. Add `mamba` to the install command line to resolve conflicts much faster.
 
 ```bash
 $ bash scripts/uninstall_conda_env.sh  # uninstall it for a clean install
-$ bash scripts/install_conda_env.sh
+$ bash scripts/install_conda_env.sh mamba  # remove mamba if it does not work
 ```
 
 > **WARNING**: DO NOT PROCEED TO RUN PIPELINES UNTIL YOU SEE THE FOLLOWING SUCCESS MESSAGE OR PIPELINE WILL NOT WORK.
````

scripts/install_conda_env.sh

Lines changed: 20 additions & 3 deletions

```diff
@@ -9,11 +9,28 @@ REQ_TXT_PY3=${SH_SCRIPT_DIR}/requirements.txt
 REQ_TXT_PY2=${SH_SCRIPT_DIR}/requirements_py2.txt
 SRC_DIR=${SH_SCRIPT_DIR}/../src
 
-conda --version  # check if conda exists
+echo "=== Checking conda version ==="
+conda --version
 
 echo "=== Installing pipeline's Conda environments ==="
-conda create -n ${CONDA_ENV_PY3} --file ${REQ_TXT_PY3} -y -c defaults -c r -c bioconda -c conda-forge
-conda create -n ${CONDA_ENV_PY2} --file ${REQ_TXT_PY2} -y -c defaults -c r -c bioconda -c conda-forge
+
+if [[ "$1" == mamba ]]; then
+    conda install mamba -y -c conda-forge
+    mamba create -n ${CONDA_ENV_PY3} --file ${REQ_TXT_PY3} -y -c defaults -c r -c bioconda -c conda-forge
+    mamba create -n ${CONDA_ENV_PY2} --file ${REQ_TXT_PY2} -y -c defaults -c r -c bioconda -c conda-forge
+else
+    echo
+    echo "If it takes too long to resolve conflicts, then try with mamba."
+    echo
+    echo "Usage: ./install_conda_env.sh mamba"
+    echo
+    echo "mamba will resolve conflicts much faster than the original conda."
+    echo "If you get a conflict in the mamba installation step itself,"
+    echo "then you may need to clean-install miniconda3 and re-login."
+    echo
+    conda create -n ${CONDA_ENV_PY3} --file ${REQ_TXT_PY3} -y -c defaults -c r -c bioconda -c conda-forge
+    conda create -n ${CONDA_ENV_PY2} --file ${REQ_TXT_PY2} -y -c defaults -c r -c bioconda -c conda-forge
+fi
 
 echo "=== Configuring for pipeline's Conda environments ==="
 CONDA_PREFIX_PY3=$(conda env list | grep -E "\b${CONDA_ENV_PY3}[[:space:]]" | awk '{if (NF==3) print $3; else print $2}')
```
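The last context line above extracts an environment's prefix path from `conda env list`. Its behavior can be seen on canned input; the sample lines below are fabricated for illustration. The `NF==3` check exists because the currently active environment gets an extra `*` column, and the trailing `[[:space:]]` in the grep pattern keeps `encode-chip-seq-pipeline` from also matching `encode-chip-seq-pipeline2`:

```shell
# Sample `conda env list`-style output (made-up paths, for illustration only).
sample='encode-chip-seq-pipeline  *  /opt/miniconda3/envs/encode-chip-seq-pipeline
encode-chip-seq-pipeline2     /opt/miniconda3/envs/encode-chip-seq-pipeline2'
# Active envs have 3 fields (name, *, path); inactive ones have 2.
prefix=$(printf '%s\n' "$sample" \
    | grep -E "\bencode-chip-seq-pipeline[[:space:]]" \
    | awk '{if (NF==3) print $3; else print $2}')
echo "$prefix"
```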
