Skip to content

Commit 0ebdc00

Browse files
committed
Refactor indels indexing and search workflow
Streamlines indels indexing by moving it to a dedicated step after genome indexing, updates directory naming conventions to use bMax, and improves error handling and logging. Cleans up variant enrichment logic, removes redundant code, and standardizes log messages for off-target and indels search steps.
1 parent 4f754f4 commit 0ebdc00

File tree

2 files changed

+60
-62
lines changed

2 files changed

+60
-62
lines changed

PostProcess/pool_search_indels.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@ def search_indels(f):
3232
print("Searching for INDELs in", chrom)
3333
if bDNA != "0" or bRNA != "0":
3434
os.system(
35-
f"crispritz.py search {current_working_directory}/genome_library/{true_pam}_2_{ref_name}+{vcf_name}_INDELS/{true_pam}_2_fake{chrom}/ {pam_file} {guide_file} fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA} -index -mm {mm} -bDNA {bDNA} -bRNA {bRNA} -t -th 1 >/dev/null"
35+
f"crispritz.py search {current_working_directory}/genome_library/{true_pam}_{bMax}_{ref_name}+{vcf_name}_INDELS/{true_pam}_{bMax}_fake{chrom}/ {pam_file} {guide_file} fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA} -index -mm {mm} -bDNA {bDNA} -bRNA {bRNA} -t -th 1 >/dev/null"
3636
)
3737
else:
38-
print("faccio ricerca brute")
3938
os.system(
4039
f"crispritz.py search {current_working_directory}/Genomes/{ref_name}+{vcf_name}_INDELS/fake_{vcf_name}_{chrom}/ {pam_file} {guide_file} fake{chrom}_{pam_name}_{guide_name}_{mm}_{bDNA}_{bRNA} -mm {mm} -t -th 1 >/dev/null"
4140
)

PostProcess/submit_job_automated_new_multiple_vcfs.sh

Lines changed: 59 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,8 @@ while read vcf_f; do
169169
dict_folder="$current_working_directory/Dictionaries/dictionaries_${vcf_name}/"
170170
indel_dict_folder="$current_working_directory/Dictionaries/log_indels_${vcf_name}/"
171171
indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS"
172-
indels_index_dir="$current_working_directory/genome_library/${true_pam}_2_${ref_name}+${vcf_name}_INDELS"
173172

174-
if ![ -d "$enriched_folder" ]; then
173+
if ! [ -d "$enriched_folder" ]; then
175174
echo -e 'Add-variants\tStart\t'$(date) >>$log
176175
echo -e "Adding variants"
177176

@@ -181,70 +180,39 @@ while read vcf_f; do
181180
if [ -s $logerror ]; then
182181
printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2
183182
# since failure happened, force genome enrichment to be repeated
184-
rm -r "$enriched_folder"* "$variants_tmp" || true
183+
rm -r "$enriched_folder"* "$variants_tmp"
185184
exit 1
186185
fi
187186

188-
# move enriched snp genome
189-
mv "$variants_tmp/SNPs_genome/${ref_name}_enriched/" "$enriched_dir/"
187+
# create indels folder
188+
mkdir -p $indels_out
189+
190+
# move enriched snp genome and indels
191+
mv "$variants_tmp/SNPs_genome/${ref_name}_enriched/" "$enriched_folder/"
192+
mv $variants_tmp/fake* $indels_out
190193

191194
# create dictionaries folders if needed
192195
mkdir -p "$dict_folder" "$indel_dict_folder"
193196

194-
# move annotation files
195-
mv "$variants_tmp/SNPs_genome/"*.json "$dict_folder"
196-
mv "$variants_tmp/SNPs_genome/log"*.txt "$indel_dict_folder"
197-
198-
echo -e 'Add-variants\tEnd\t'$(date) >>"$log"
199-
200-
# START STEP 1.1 - indels indexing
201-
echo -e 'Indexing Indels\tStart\t'$(date) >>"$log"
202-
"$starting_dir/pool_index_indels.py" \
203-
"$variants_tmp/" \
204-
"$pam_file" \
205-
"$true_pam" \
206-
"$ref_name" \
207-
"$vcf_name" \
208-
"$bMax" \
209-
"$ncpus"
210-
211-
if [ -s "$logerror" ]; then
212-
printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2
213-
rm -r "$enriched_dir"* "$variants_tmp" || true
214-
exit 1
215-
fi
216-
echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log"
197+
# move variant annotation files
198+
mv $variants_tmp/SNPs_genome/*.json "$dict_folder"
199+
mv $variants_tmp/SNPs_genome/log*.txt "$indel_dict_folder"
217200

218-
mkdir -p "$indels_out"
219-
mv "$variants_tmp/fake"* "$indels_out"
201+
# remove temporary variant genome folder
220202
rm -r "$variants_tmp"
221-
# END STEP 1.1 - indels indexing
203+
204+
echo -e 'Add-variants\tEnd\t'$(date) >>"$log"
222205
else
223206
echo -e "Variants already added"
224207
fi
225208

226-
# START STEP 1.1 - indels indexing
227-
# indels indexing if skipped in previous block
228-
if ! [ -d "$indels_index_dir" ]; then
229-
echo -e 'Indexing Indels\tStart\t'$(date) >>"$log"
230-
"$starting_dir/pool_index_indels.py" \
231-
"$variants_tmp/" \
232-
"$pam_file" \
233-
"$true_pam" \
234-
"$ref_name" \
235-
"$vcf_name" \
236-
"$bMax" \
237-
"$ncpus"
238-
239-
if [ -s "$logerror" ]; then
240-
printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2
241-
rm -rf "$enriched_dir"* "$variants_tmp" || true
242-
exit 1
243-
fi
244-
echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log"
245-
fi
246-
# END STEP 1.1 - indels indexing
209+
cd $current_working_directory
210+
fi
247211

212+
# check if anything odd happened during enrichment
213+
if [ -s $logerror ]; then
214+
printf "ERROR: Genome enrichment failed on %s\n" "$vcf_name" >&2
215+
exit 1
248216
fi
249217
# END STEP 1 - Genome enrichment
250218

@@ -266,7 +234,7 @@ while read vcf_f; do
266234
echo "Reference Index already present"
267235
echo -e 'Index-genome Reference\tEnd\t'$(date) >>"$log"
268236
idx_ref="$idx_folder1"
269-
elif [ -d "$idx_folder2"]; then
237+
elif [ -d "$idx_folder2" ]; then
270238
echo "Reference Index already present"
271239
echo -e 'Index-genome Reference\tEnd\t'$(date) >>"$log"
272240
idx_ref="$idx_folder2"
@@ -331,6 +299,38 @@ while read vcf_f; do
331299
echo -e 'Index-genome Variant\tEnd\t'$(date) >>"$log"
332300
idx_var="$idx_folder1"
333301
fi
302+
303+
# START STEP 2.3 - indels indexing
304+
indels_index_dir="$current_working_directory/genome_library/${true_pam}_${bMax}_${ref_name}+${vcf_name}_INDELS"
305+
indels_out="$current_working_directory/Genomes/${ref_name}+${vcf_name}_INDELS"
306+
307+
if ! [ -d "$indels_index_dir" ]; then
308+
echo -e 'Indexing Indels\tStart\t'$(date) >>"$log"
309+
"$starting_dir/pool_index_indels.py" \
310+
"$indels_out/" \
311+
"$pam_file" \
312+
"$true_pam" \
313+
"$ref_name" \
314+
"$vcf_name" \
315+
"$bMax" \
316+
"$ncpus"
317+
318+
if [ -s "$logerror" ]; then
319+
printf "ERROR: indels indexing failed on %s\n" "$vcf_name" >&2
320+
[ -d "$indels_index_dir" ] && rm -r "$indels_index_dir"
321+
exit 1
322+
fi
323+
324+
echo -e 'Indexing Indels\tEnd\t'$(date) >>"$log"
325+
else
326+
echo "Indels Index already present"
327+
fi
328+
# END STEP 2.3 - indels indexing
329+
fi
330+
331+
if [ -s $logerror ]; then
332+
printf "ERROR: Genome indexing failed" >&2
333+
exit 1
334334
fi
335335
# END STEP 2 - genome indexing
336336

@@ -346,9 +346,9 @@ while read vcf_f; do
346346
pids=() # reset process ids
347347
names=()
348348
# TODO: ricerca ref lanciata in parallelo con ricerca alternative
349-
echo -e 'Off-targets search\Start\t'$(date) >>$log
349+
echo -e 'Off-targets search\tStart\t'$(date) >>$log
350350
if ! [ -f "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then
351-
echo -e 'Search Reference\tStart\t'$(date) >>$log # off-targets search on reference genome
351+
echo -e 'Search Reference Start' # off-targets search on reference genome
352352
if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then # no bulges
353353
crispritz.py search $idx_ref "$pam_file" "$guide_file" "${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result &
354354
pid_search_ref=$!
@@ -370,14 +370,15 @@ while read vcf_f; do
370370
exit 1
371371
fi
372372
fi
373+
echo -e 'Search Reference completed'
373374
else
374375
echo -e "Search for reference already done"
375376
fi
376377

377378
if [ "$vcf_name" != "_" ]; then
378379
# TODO: search in parallel on ref and alt
379380
if ! [ -f "$output_folder/crispritz_targets/${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then
380-
echo -e 'Search Variant\tStart\t'$(date) >>$log # search off-targets on alternative genomes (snps only)
381+
echo -e 'Search Variant Start' # search off-targets on alternative genomes (snps only)
381382
if [ "$bDNA" -ne 0 ] || [ "$bRNA" -ne 0 ]; then # no bulge
382383
crispritz.py search "$idx_var" "$pam_file" "$guide_file" "${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}" -index -mm $mm -bDNA $bDNA -bRNA $bRNA -t -th $ceiling_result -var &
383384
pid_search_var=$!
@@ -399,15 +400,14 @@ while read vcf_f; do
399400
rm -r $output_folder/*.targets.txt $output_folder/*profile* # delete results folder
400401
exit 1
401402
fi
402-
echo -e 'Search Variant\tEnd\t'$(date) >>$log
403403
fi
404404
else
405405
echo -e "Search for variant already done"
406406
fi
407+
echo -e 'Search Variant completed'
407408

408409
if ! [ -f "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" ]; then
409410
echo -e "Search INDELs Start"
410-
echo -e 'Search INDELs\tStart\t'$(date) >>$log
411411
cd $starting_dir
412412
# TODO: REMOVE POOL SCRIPT FROM PROCESSING
413413
./pool_search_indels.py "$ref_folder" "$vcf_folder" "$vcf_name" "$guide_file" "$pam_file" $bMax $mm $bDNA $bRNA "$output_folder" $true_pam "$current_working_directory/" "$ncpus"
@@ -418,8 +418,7 @@ while read vcf_f; do
418418
fi
419419
awk '($3 !~ "n") {print $0}' "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp"
420420
mv "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.tmp" "$output_folder/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt"
421-
echo -e "Search INDELs End"
422-
echo -e 'Search INDELs\tEnd\t'$(date) >>$log
421+
echo -e "Search INDELs completed"
423422
else
424423
echo -e "Search INDELs already done"
425424
fi

0 commit comments

Comments
 (0)