diff --git a/dockers/dictys_v4/Dockerfile b/dockers/dictys_v4/Dockerfile
new file mode 100644
index 000000000..bf3cfa0e4
--- /dev/null
+++ b/dockers/dictys_v4/Dockerfile
@@ -0,0 +1,122 @@
+FROM ubuntu:22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ="America/New_York"
+
+# Base OS deps (build tools + common libs) + libpng for matplotlib
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl ca-certificates git unzip zip \
+    build-essential pkg-config \
+    zlib1g-dev libbz2-dev liblzma-dev \
+    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \
+    libfreetype6-dev libpng-dev \
+    python3-venv python3-distutils python3-dev \
+    # bio tools via apt instead of building
+    samtools tabix \
+    perl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install CPython 3.9.17 from source
+ARG PYTHON_VERSION=3.9.17
+RUN set -eux; \
+    cd /tmp; \
+    curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \
+    tar -xzf Python-${PYTHON_VERSION}.tgz; \
+    cd Python-${PYTHON_VERSION}; \
+    ./configure --enable-optimizations; \
+    make -j"$(nproc)"; \
+    make install; \
+    cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \
+    ln -s /usr/local/bin/python3 /usr/local/bin/python; \
+    ln -s /usr/local/bin/pip3 /usr/local/bin/pip
+
+# Make constraints global for all pip installs
+COPY constraints.txt /tmp/constraints.txt
+ENV PIP_CONSTRAINT=/tmp/constraints.txt \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_DEFAULT_TIMEOUT=180
+
+# Clean any existing numpy/matplotlib remnants aggressively
+RUN python - <<'PY'
+import site, shutil, pathlib
+paths = set(site.getsitepackages() + [site.getusersitepackages()])
+for p in list(paths):
+    if not p:
+        continue
+    for name in ("numpy", "matplotlib"):
+        for m in pathlib.Path(p).glob(name):
+            shutil.rmtree(m, ignore_errors=True)
+        for m in pathlib.Path(p).glob(f"{name}-*.dist-info"):
+            shutil.rmtree(m, ignore_errors=True)
+        for m in pathlib.Path(p).glob(f"{name}-*.egg-info"):
+            shutil.rmtree(m, ignore_errors=True)
+print("Cleaned numpy/matplotlib from:", *paths, sep="\n - ")
+PY
+
+# Install bedtools
+RUN apt-get update && apt-get install -y --no-install-recommends bedtools \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install build tooling + exact numpy/matplotlib/cython pins
+RUN python -m pip install --no-cache-dir -U pip setuptools wheel \
+    && pip install --no-cache-dir --upgrade --force-reinstall \
+    "numpy==1.26.4" "matplotlib==3.8.4" "cython<3"
+
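With `PIP_CONSTRAINT` exported above, every later `pip install` in this image is subject to the same pins. A throwaway check like the sketch below (hypothetical, not part of the Dockerfile; it assumes the pinned install above has already run) can confirm the constraints actually won:

```python
# Hypothetical build-time sanity check: verify the constraint pins held.
# Could be run as a `RUN python - <<'PY' ... PY` step after the pinned installs.
import numpy, matplotlib

assert numpy.__version__ == "1.26.4", numpy.__version__           # from constraints.txt
assert matplotlib.__version__ == "3.8.4", matplotlib.__version__  # pinned, satisfies <3.9
print("pinned toolchain OK:", numpy.__version__, matplotlib.__version__)
```
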
+# Install MACS2. Build-from-source packages must reuse our pinned toolchain
+RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1
+
+# Install Dictys without dependencies (we'll install them manually right after)
+RUN pip install --no-cache-dir --no-build-isolation --no-deps \
+    git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f
+
+# Install Dictys dependencies plus notebook tooling
+RUN pip install --no-cache-dir --prefer-binary \
+    pandas scipy networkx h5py threadpoolctl joblib \
+    jupyter jupyterlab adjustText pyro-ppl docutils requests
+
+# Install pyDNase and anndata (plus their light deps) without dependencies so pyDNase can't pin matplotlib<2
+RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata
+
+# Install pybedtools version that works with cython<3
+RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1"
+
+# Install pytorch
+# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch
+
+# HOMER prerequisites
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget perl unzip build-essential zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install HOMER core + hg38 genome
+RUN set -eux; \
+    mkdir -p /opt/homer && cd /opt/homer; \
+    curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \
+    chmod +x configureHomer.pl; \
+    perl configureHomer.pl -install homer; \
+    perl configureHomer.pl -install homerTools; \
+    perl configureHomer.pl -install hg38
+ENV PATH="/opt/homer/bin:${PATH}"
+
+# hg38 annotations only (drop other genomes from the update list)
+RUN set -eux; \
+    cd /opt/homer; \
+    grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \
+    cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt
+
+# Install CUDA
+# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \
+#     https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+#     curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \
+#     https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
+#     echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
+#     > /etc/apt/sources.list.d/cuda.list && \
+#     apt-get update && apt-get install -y --no-install-recommends cuda && \
+#     rm -rf /var/lib/apt/lists/*
+
+# Install AWS CLI (clean up the installer afterwards, as dictys_v5 does)
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
+    unzip awscliv2.zip && \
+    ./aws/install && rm -rf aws awscliv2.zip
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/dockers/dictys_v4/constraints.txt b/dockers/dictys_v4/constraints.txt
new file mode 100644
index 000000000..c5b8e303b
--- /dev/null
+++ b/dockers/dictys_v4/constraints.txt
@@ -0,0 +1,3 @@
+numpy==1.26.4
+matplotlib<3.9
+cython<3
\ No newline at end of file
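Because several packages in the dictys_v4 image are installed with `--no-deps`, pip never verifies the final environment for consistency. A hedged post-install step like the sketch below (not part of either image; `pip check` is a standard pip subcommand) would surface anything left unsatisfied, e.g. a package that still wants `matplotlib<2`:

```python
# Run `pip check` and report unmet requirements left behind by --no-deps installs.
import subprocess, sys

result = subprocess.run([sys.executable, "-m", "pip", "check"],
                        capture_output=True, text=True)
print(result.stdout.strip())            # "No broken requirements found." when clean
print("exit code:", result.returncode)  # non-zero means something is unsatisfied
```
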
diff --git a/dockers/dictys_v5/Dockerfile b/dockers/dictys_v5/Dockerfile
new file mode 100644
index 000000000..cae4c569a
--- /dev/null
+++ b/dockers/dictys_v5/Dockerfile
@@ -0,0 +1,31 @@
+FROM lfwa/dictys-cpu:latest
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ="America/New_York"
+
+# Install bio tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates curl unzip less groff \
+    tabix samtools \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install anndata
+RUN pip install --no-cache-dir anndata
+
+# Install AWS CLI v2
+RUN curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" \
+    && unzip -q /tmp/awscliv2.zip -d /tmp \
+    && /tmp/aws/install --update \
+    && rm -rf /tmp/aws /tmp/awscliv2.zip
+
+# Remove MACS2, install macs3 into the dictys env, add a macs2-named wrapper (use `.`, not `source`, so the default /bin/sh can run this step)
+RUN . /opt/conda/etc/profile.d/conda.sh \
+    && conda activate dictys \
+    && (pip uninstall -y MACS2 || true) \
+    && (conda remove -y macs2 || true) \
+    && conda install -y -c conda-forge -c bioconda macs3 \
+    && printf '#!/usr/bin/env bash\nexec macs3 "$@"\n' > "$(conda run -n dictys --no-capture-output python -c 'import sys,os; print(os.path.join(os.path.dirname(sys.executable), "macs2"))')" \
+    && chmod +x "$(conda run -n dictys --no-capture-output python -c 'import sys,os; print(os.path.join(os.path.dirname(sys.executable), "macs2"))')" \
+    && conda clean -afy
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/dockers/dictys_v5/constraints.txt b/dockers/dictys_v5/constraints.txt
new file mode 100644
index 000000000..c5b8e303b
--- /dev/null
+++ b/dockers/dictys_v5/constraints.txt
@@ -0,0 +1,3 @@
+numpy==1.26.4
+matplotlib<3.9
+cython<3
\ No newline at end of file
diff --git a/src/methods/dictys/helper.py b/src/methods/dictys/helper.py
index 90cde9b03..a80373a3b 100644
--- a/src/methods/dictys/helper.py
+++ b/src/methods/dictys/helper.py
@@ -1,6 +1,9 @@
 import os
 os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
 os.environ["MKL_THREADING_LAYER"] = "GNU"
+import shutil
+import json
+from typing import Optional, List
 
 import numpy as np
 import pandas as pd
@@ -11,6 +14,31 @@
 
 warnings.filterwarnings("ignore")
 
+OVERRIDE_DOWNLOAD = True
+
+
+def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None:
+    kwargs = {}
+    if cwd is not None:
+        kwargs['cwd'] = cwd
+    with subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        **kwargs
+    ) as proc:
+        for line in proc.stdout:
+            print(line, end="")
+        rc = proc.wait()
+    if rc != 0:
+        raise RuntimeError(f"Command {cmd} failed with exit code {rc}")
+
+
+def run_cmd_inside_dictys(cmd: List[str], cwd: Optional[str] = None) -> None:
+    run_cmd(["conda", "run", "-n", "dictys", "--no-capture-output", *cmd], cwd=cwd)
+
 
 def define_vars(par):
     os.makedirs(par['temp_dir'], exist_ok=True)
@@ -28,6 +56,7 @@ def define_vars(par):
 
     par['bams_dir'] = f"{par['data_dir']}/bams/"
     par['gene_bed'] = f"{par['data_dir']}/gene.bed"
+    par['make_dir'] = f"{par['temp_dir']}/makefiles"
 
 
 def extract_exp(par):
@@ -88,6 +117,7 @@ def extract_atac(par):
     print(f'Sort and compress tsv file {frags_path}')
     os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}")
 
+
 def create_bam(par):
     print('Creating BAM file from fragments', flush=True)
     cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}"
@@ -107,9 +137,24 @@ def bam_to_bams(par):
     - 'bams_dir': path to output folder for per-cell BAMs
     - 'exp_path': path to reference expression file
     """
+
+    print('Deleting stale temp BAM directories', flush=True)
+    folders = [
+        par['bams_dir'],
+        os.path.join(par['bams_dir'], '..', 'bams_text'),
+        os.path.join(par['bams_dir'], '..', 'bams_header')
+    ]
+    for folder in folders:
+        if os.path.exists(folder):
+            shutil.rmtree(folder)
+
     print('Splitting BAM into per-cell BAMs', flush=True)
-    cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}"
-    run_cmd(cmd)
+    run_cmd_inside_dictys([
+        "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'],
+        "--section", "CB:Z:", "--ref_expression", par['exp_path']
+    ])
+
+
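For orientation, a minimal usage sketch of the two helpers introduced at the top of this file (the `samtools` call is illustrative and assumes the tool exists inside the `dictys` conda env):

```python
# run_cmd streams child output line by line and raises RuntimeError on a
# non-zero exit; run_cmd_inside_dictys prefixes `conda run -n dictys`.
run_cmd(["echo", "hello"])                        # prints "hello"
run_cmd_inside_dictys(["samtools", "--version"])  # runs inside the dictys env
try:
    run_cmd(["false"])                            # exits with code 1
except RuntimeError as err:
    print(err)  # Command ['false'] failed with exit code 1
```
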
 def extrac_clusters(par):
     print('Extracting clusters', flush=True)
     subsets = f"{par['data_dir']}/subsets.txt"
@@ -127,15 +172,6 @@ def extrac_clusters(par):
         subprocess.run(cp, shell=True, check=True)
     print('Extracting clusters successful', flush=True)
 
-def run_cmd(cmd):
-    try:
-        result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True)
-        print("STDOUT:", result.stdout)
-        print("STDERR:", result.stderr)
-    except subprocess.CalledProcessError as e:
-        print("Command failed with exit code", e.returncode)
-        print("STDOUT:", e.stdout)
-        print("STDERR:", e.stderr)
 
 def download_file(url, dest):
     import requests
@@ -145,21 +181,27 @@ def download_file(url, dest):
     with open(dest, "wb") as f:
         for chunk in r.iter_content(chunk_size=8192):
             f.write(chunk)
+
+
 def get_priors(par):
     import gzip
     import shutil
     # - get the genome
     print('Getting genome ...', flush=True)
-    os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
-    cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request"
-    try:
-        run_cmd(cmd)
-    except:
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")):
+        os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
         try:
-            cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/"
-            run_cmd(cmd)
+            run_cmd([
+                "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa",
+                f"{par['data_dir']}/genome/", "--no-sign-request"
+            ])
         except:
-            raise ValueError("Could not get the genome")
+            try:
+                run_cmd([
+                    "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/"
+                ])
+            except Exception:
+                raise ValueError("Could not get the reference genome")
 
     # - get gene annotation
     print('Getting gene annotation ...', flush=True)
@@ -167,29 +209,33 @@ def get_priors(par):
     gtf_gz = data_dir / "gene.gtf.gz"
     gtf = data_dir / "gene.gtf"
     url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
-    download_file(url, gtf_gz)
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)):
+        download_file(url, gtf_gz)
     with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out:
         shutil.copyfileobj(f_in, f_out)
     gtf_gz.unlink()
 
-    cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}"
     print('Making bed files for gene annotation ...', flush=True)
-    run_cmd(cmd)
+    run_cmd_inside_dictys([
+        "dictys_helper", "gene_gtf.sh", str(gtf), str(par['gene_bed'])
+    ])
 
     print('Downloading motif file...', flush=True)
     url='https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif'
     motif_file = data_dir / 'motifs.motif'
-    download_file(url, motif_file)
-
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)):
+        download_file(url, motif_file)
+
+
 def configure(par):
-    import json
-    device='cuda:0' #cuda:0 , cpu
-    par['make_dir'] = f"{par['temp_dir']}/makefiles"
+
+    device = 'cpu'
     os.makedirs(par['make_dir'], exist_ok=True)
 
-    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk"
-    run_cmd(cmd)
+    run_cmd_inside_dictys([
+        "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk"
+    ], cwd=par['make_dir'])
 
     json_arg = json.dumps({
         "DEVICE": device,
@@ -197,14 +243,23 @@ def configure(par):
         "JOINT": "1"
     })
 
-    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'"
-    run_cmd(cmd)
-    cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py"
-    run_cmd(cmd)
+    run_cmd_inside_dictys([
+        "dictys_helper", "makefile_update.py", "config.mk", json_arg
+    ], cwd=par['make_dir'])
+
+    run_cmd_inside_dictys([
+        "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'],
+        "--dir_data", par['data_dir']
+    ])
+
+
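For reference, `makefile_update.py` receives its overrides as a single JSON argv element; passing `json_arg` as one list item (rather than interpolating it into a shell string, as the removed code did) sidesteps quoting bugs. Only the keys visible in the hunk are shown here; the real dict may carry further entries elided between hunks:

```python
# Illustrative payload for makefile_update.py; keys mirror the hunk above.
import json

json_arg = json.dumps({"DEVICE": "cpu", "JOINT": "1"})
print(json_arg)  # {"DEVICE": "cpu", "JOINT": "1"}
```
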
"makefile_update.py", "config.mk", json_arg + ], cwd=par['make_dir']) + + run_cmd_inside_dictys([ + "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'], + "--dir_data", par['data_dir'] + ]) + + def infer_grn(par): print('Inferring GRNs', flush=True) - cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static" - run_cmd(cmd) + run_cmd_inside_dictys([ + "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static" + ], cwd=par['temp_dir']) + + def export_net(par): from util import process_links from dictys.net import network @@ -223,9 +278,10 @@ def export_net(par): output = ad.AnnData(X=None, uns={"method_id": "dictys", "dataset_id": ad.read_h5ad(par['rna'], backed='r').uns['dataset_id'], "prediction": net[["source", "target", "weight"]]}) output.write(par['prediction']) + def main(par): - define_vars(par) + define_vars(par) extract_exp(par) extract_atac(par) create_bam(par) diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py index 481d09f25..bea5317f4 100644 --- a/src/metrics/sem/helper.py +++ b/src/metrics/sem/helper.py @@ -26,7 +26,7 @@ MAX_N_ITER = 500 # For reproducibility purposes -seed = 0xCADD +seed = 0xCAFF os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) @@ -242,8 +242,11 @@ def evaluate_grn( # Learn initial GRN weights from the first set print("Infer GRN weights from training set, given the provided GRN topology") + signed = False A = regression_based_grn_inference(X_controls, A, signed=signed) - # A = spearman_based_grn_inference(X_controls, A, signed=signed) + + print(np.mean(A != 0)) + exit() # Learn shocks from perturbations, using reporter genes only print("Learn shocks from perturbations, using reporter genes only") @@ -279,13 +282,16 @@ def evaluate_grn( best_iteration = 0 pbar = tqdm.tqdm(range(n_iter)) X_non_reporter = delta_X_train[:, ~is_reporter] + losses = [] for iteration in pbar: optimizer.zero_grad() A_eff = torch.abs(A) * signs if signed else A * mask + delta_X_hat = solve_sem(A_eff, delta_train) loss = torch.mean(torch.sum(torch.square(X_non_reporter - delta_X_hat[:, ~is_reporter]), dim=1)) loss = loss + 0.001 * torch.sum(torch.abs(A)) loss = loss + 0.001 * torch.sum(torch.square(A)) + losses.append(loss.item()) pbar.set_description(str(loss.item())) # Keep track of best solution @@ -298,6 +304,7 @@ def evaluate_grn( optimizer.step() scheduler.step(loss.item()) # pbar.set_description(str(loss.item())) + A = best_A_eff mask = mask.detach().cpu().numpy().astype(bool) print(f"Best iteration: {best_iteration + 1} / {n_iter}") @@ -389,6 +396,10 @@ def main(par): # Get negative controls X_controls = X[are_controls, :] + if np.all(np.std(X_controls, axis=0) == 0): + raise RuntimeError("All negative controls have the same expression profile") + + # Compute perturbations delta_X = compute_perturbations(X, are_controls, match_groups, loose_match_groups) # Remove negative controls from downstream analysis