Skip to content

Commit 5a42185

Browse files
authored
Merge pull request #83 from pinellolab/v218
Release v218 This PR introduces major improvements in off-target detection, annotation efficiency, and command-line usability. Key Updates: - Comprehensive off-target nomination; integrated support for alignments featuring gaps in both gRNA and spacer sequences, as well as spacer alignments initiating with gaps. (Requires CRISPRitz v2.7.0) - Refined CFD scoring; adjusted the CFD (Cutting Frequency Determination) computation to correctly account for the expanded alignment configurations introduced in this version. - Enhanced annotation pipeline; annotation now supports compressed and indexed BED files, leveraging samtools indexing for substantially faster processing particularly beneficial for genome-wide searches and large variant datasets such as gnomAD. - Fix bug in gnomAD VCF conversion pipeline - Improved CLI help for complete-search; updated and clarified help messages, providing detailed flag descriptions and emphasizing input format requirements for annotation files. Summary: This version extends the accuracy and scalability of CRISPRme’s off-target search and improves usability through optimized annotation handling and clearer documentation.
2 parents e07a459 + 2dcccf5 commit 5a42185

19 files changed

+1906
-2770
lines changed

.gitignore

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,177 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
*.pyc
28+
MANIFEST
29+
30+
# PyInstaller
31+
# Usually these files are written by a python script from a template
32+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33+
*.manifest
34+
*.spec
35+
36+
# Installer logs
37+
pip-log.txt
38+
pip-delete-this-directory.txt
39+
40+
# Unit test / coverage reports
41+
htmlcov/
42+
.tox/
43+
.nox/
44+
.coverage
45+
.coverage.*
46+
.cache
47+
nosetests.xml
48+
coverage.xml
49+
*.cover
50+
*.py,cover
51+
.hypothesis/
52+
.pytest_cache/
53+
cover/
54+
55+
# Translations
56+
*.mo
57+
*.pot
58+
59+
# Django stuff:
60+
*.log
61+
local_settings.py
62+
db.sqlite3
63+
db.sqlite3-journal
64+
65+
# Flask stuff:
66+
instance/
67+
.webassets-cache
68+
69+
# Scrapy stuff:
70+
.scrapy
71+
72+
# Sphinx documentation
73+
docs/_build/
74+
75+
# PyBuilder
76+
.pybuilder/
77+
target/
78+
79+
# Jupyter Notebook
80+
.ipynb_checkpoints
81+
82+
# IPython
83+
profile_default/
84+
ipython_config.py
85+
86+
# pyenv
87+
# For a library or package, you might want to ignore these files since the code is
88+
# intended to run in multiple environments; otherwise, check them in:
89+
# .python-version
90+
91+
# pipenv
92+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
94+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
95+
# install all needed dependencies.
96+
#Pipfile.lock
97+
98+
# poetry
99+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100+
# This is especially recommended for binary packages to ensure reproducibility, and is more
101+
# commonly ignored for libraries.
102+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103+
#poetry.lock
104+
105+
# pdm
106+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107+
#pdm.lock
108+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109+
# in version control.
110+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
111+
.pdm.toml
112+
.pdm-python
113+
.pdm-build/
114+
115+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116+
__pypackages__/
117+
118+
# Celery stuff
119+
celerybeat-schedule
120+
celerybeat.pid
121+
122+
# SageMath parsed files
123+
*.sage.py
124+
125+
# Environments
126+
.env
127+
.venv
128+
env/
129+
venv/
130+
ENV/
131+
env.bak/
132+
venv.bak/
133+
134+
# Spyder project settings
135+
.spyderproject
136+
.spyproject
137+
138+
# Rope project settings
139+
.ropeproject
140+
141+
# mkdocs documentation
142+
/site
143+
144+
# mypy
145+
.mypy_cache/
146+
.dmypy.json
147+
dmypy.json
148+
149+
# Pyre type checker
150+
.pyre/
151+
152+
# pytype static type analyzer
153+
.pytype/
154+
155+
# Cython debug symbols
156+
cython_debug/
157+
158+
# PyCharm
159+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161+
# and can be added to the global gitignore or merged into this file. For a more nuclear
162+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
163+
.idea/
164+
165+
# OSX
166+
.DS_store
167+
__MACOSX
168+
169+
# crisprme
1170
Genomes/
2171
genome_library/
3172
Results/
4173
Dictionaries/
5174
VCFs/
6-
*.pyc
7175
Gencode/
8176
Cache/
9177
*nohup*

.vscode/settings.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

PostProcess/annotation.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
""" """
2+
3+
from pysam import TabixFile
4+
from pysam.utils import SamtoolsError
5+
from typing import List, Tuple, Optional
6+
from time import time
7+
8+
import pysam
9+
import sys
10+
import os
11+
12+
TBI = "tbi"
13+
14+
def parse_commandline(args: List[str]) -> Tuple[str, str, str]:
15+
"""Parses and validates command line arguments for the annotation script.
16+
17+
Ensures the correct number of arguments are provided and that input files exist.
18+
Returns the file paths for the offtargets, annotation, and output files.
19+
20+
Args:
21+
args: List of command line arguments.
22+
23+
Returns:
24+
Tuple containing the offtargets file name, annotation file name, and output
25+
file name.
26+
27+
Raises:
28+
ValueError: If the number of arguments is incorrect.
29+
FileNotFoundError: If any of the specified files do not exist.
30+
"""
31+
if len(args) != 3: # check input arguments consistency
32+
raise ValueError("Too many/few input arguments for annotation script")
33+
offtargets_fname = args[0] # offtargets table report
34+
if not os.path.isfile(offtargets_fname):
35+
raise FileNotFoundError(f"Cannot find off-targets file {offtargets_fname}")
36+
annotation_fname = args[1]
37+
if os.path.basename(annotation_fname) != "vuoto.txt" and not os.path.isfile(annotation_fname):
38+
raise FileNotFoundError(
39+
f"Cannot find annotation files {annotation_fname}"
40+
)
41+
offtargets_out_fname = args[2] # annotated offtargets table report
42+
assert offtargets_out_fname
43+
return offtargets_fname, annotation_fname, offtargets_out_fname
44+
45+
46+
def load_annotation_bed(annotation_fname: str) -> TabixFile:
47+
"""Loads a BED annotation file and ensures a Tabix index is present.
48+
49+
Checks for the existence of a Tabix index for the given annotation BED file
50+
and creates one if necessary. Returns a TabixFile object for querying the
51+
annotation data.
52+
53+
Args:
54+
annotation_fname: Path to the annotation BED file.
55+
56+
Returns:
57+
TabixFile object for the annotation BED file.
58+
59+
Raises:
60+
SamtoolsError: If the annotation BED file cannot be loaded or indexed.
61+
"""
62+
# check that tabix index is available for all annotation bed
63+
pysam.tabix_index(annotation_fname, force=True, preset="bed")
64+
try: # return tabix indexes for each annotation bed
65+
return pysam.TabixFile(annotation_fname)
66+
except (SamtoolsError, Exception) as e:
67+
raise SamtoolsError(
68+
"An error occurred while loading Annotation BED files"
69+
) from e
70+
71+
72+
def compute_target_coords(fields: List[str]) -> Tuple[str, int, int]:
73+
"""Computes the genomic coordinates for a target from a list of fields.
74+
75+
Returns the chromosome, start, and stop positions for the target based on
76+
the provided fields.
77+
78+
Args:
79+
fields: List of string fields containing target information.
80+
81+
Returns:
82+
Tuple containing the chromosome (str), start (int), and stop (int)
83+
positions.
84+
"""
85+
start = int(fields[5]) # retrieve target start position
86+
stop = start + len(fields[1].replace("-", "")) # compute target stop
87+
return fields[4], start, stop
88+
89+
90+
def annotate_target(chrom: str, start: int, stop: int, annotation: TabixFile) -> str:
91+
"""Annotates a genomic target using a Tabix-indexed annotation file.
92+
93+
Retrieves and returns a comma-separated list of annotation features overlapping
94+
the specified genomic region.
95+
96+
Args:
97+
chrom: Chromosome name.
98+
start: Start position of the target.
99+
stop: Stop position of the target.
100+
annotation: TabixFile object for annotation data.
101+
102+
Returns:
103+
Comma-separated string of annotation features for the target.
104+
"""
105+
target_anns = {
106+
feature.split()[3]
107+
for feature in annotation.fetch(chrom, start, stop)
108+
} # retrieve annotations for current target
109+
return ",".join(sorted(target_anns)) # report as comma-separated list
110+
111+
112+
def annotate_offtargets(
113+
offtargets_fname: str, annotation: Optional[TabixFile], offtargets_out_fname: str,
114+
) -> None:
115+
"""Annotates a genomic target using a Tabix-indexed annotation file.
116+
117+
Retrieves and returns a comma-separated list of annotation features overlapping
118+
the specified genomic region.
119+
120+
Args:
121+
chrom: Chromosome name.
122+
start: Start position of the target.
123+
stop: Stop position of the target.
124+
annotation: TabixFile object for annotation data.
125+
126+
Returns:
127+
Comma-separated string of annotation features for the target.
128+
"""
129+
try:
130+
with open(offtargets_fname, mode="r") as infile, open(
131+
offtargets_out_fname, mode="w"
132+
) as outfile:
133+
header = infile.readline().strip() # preserve header in annotated report
134+
outfile.write(f"{header}\n")
135+
for offtarget in infile: # read offtargets
136+
fields = offtarget.split() # split offtarget individual fields
137+
# annotate current off-target using input annotation datasets
138+
fields[14] = (
139+
"n" if annotation is None else annotate_target(
140+
*compute_target_coords(fields), annotation
141+
)
142+
)
143+
offtarget_annotated = "\t".join(fields)
144+
outfile.write(f"{offtarget_annotated}\n") # write annotated offtarget
145+
except (IOError, Exception) as e:
146+
raise OSError(f"Annotation failed on off-targets in {offtargets_fname}") from e
147+
148+
149+
def main() -> None:
150+
"""Annotates a list of off-targets using an optional annotation dataset.
151+
152+
Reads off-targets from a file, annotates each entry with features from the
153+
provided annotation dataset, and writes the results to an output file.
154+
155+
Args:
156+
offtargets_fname: Path to the input file containing off-targets.
157+
annotation: TabixFile object for annotation data, or None if no annotation.
158+
offtargets_out_fname: Path to the output file for annotated off-targets.
159+
160+
Raises:
161+
OSError: If annotation fails due to file I/O or processing errors.
162+
"""
163+
# parse input command line arguments
164+
offtargets_fname, annotation, offtargets_out_fname = parse_commandline(sys.argv[1:])
165+
start = time() # track annotation time
166+
empty = os.path.basename(annotation) == "vuoto.txt"
167+
# load annotation bed files
168+
annotation = None if empty else load_annotation_bed(annotation)
169+
# annotate offtargets using input bed files
170+
annotate_offtargets(offtargets_fname, annotation, offtargets_out_fname)
171+
sys.stdout.write(f"Annotation completed in {time() - start:.2f}s\n")
172+
173+
174+
if __name__ == "__main__":
175+
main()

PostProcess/hg38_gnomAD.samplesID.txt

Lines changed: 0 additions & 11 deletions
This file was deleted.

PostProcess/merge_close_targets_cfd.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,4 @@ else
3131
criteria=$sorting_criteria
3232
fi
3333

34-
# python remove_contiguous_samples_cfd.py $fileIn.sorted $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd
35-
python remove_contiguous_samples_cfd.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_criteria $criteria
36-
# rm $fileIn.sorted
34+
python remove_contiguous_samples.py $fileIn $fileOut $thresh $chrom $position $total $true_guide $snp_info $cfd $sort_criteria $criteria

0 commit comments

Comments
 (0)