Skip to content

Commit d42e8a2

Browse files
authored
Merge pull request #3 from ocisse/master
Update VirFac pipeline Somebody brought this repo to my attention after a while and I noticed this outstanding PR I missed back in the day. Sorry Ousmane! I'm accepting it now.
2 parents 9b6f44f + bcd8561 commit d42e8a2

File tree

7 files changed

+268
-8
lines changed

7 files changed

+268
-8
lines changed

scripts/compare_two_metaG.pl

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env perl -w
2+
#
3+
#
4+
use IO::All;
5+
use feature 'say';
6+
use Carp;
7+
use strict;
8+
use Data::Dumper;
9+
use Array::Utils qw(:all);
10+
11+
my ($hmmOut1,$hmmOut2) = @ARGV;
12+
13+
# get the domains => key: domain; val: (ctg_id,full_evalue,aln_start,aln_end,#copies)
14+
my %hmmOut1 = get_domains_info($hmmOut1);
15+
my %hmmOut2 = get_domains_info($hmmOut2);
16+
17+
#say Dumper \%hmmOut1;
18+
19+
# compare the two hashes
20+
compare_metagenome(\%hmmOut1,\%hmmOut2);
21+
22+
23+
24+
25+
# sub
26+
sub compare_metagenome {
27+
28+
my ($m1,$m2) = @_;
29+
30+
my %m1 = %$m1;
31+
my %m2 = %$m2;
32+
33+
# easiest way - do they have the same domains
34+
my @dom1 = (keys %m1);
35+
my @dom2 = (keys %m2);
36+
my @isect = intersect(@dom1, @dom2); # domains present in both datasets
37+
#print join(",", @isect) . "\n";
38+
my @diff = array_diff(@dom1, @dom2);
39+
my @unique = unique(@dom1,@dom2);
40+
41+
warn"...\t# of domains found in the two datasets:\t". @isect ."\n";
42+
warn"...\t# of domains that are different\t". @diff ."\n";
43+
warn"...\t# of domains that are unique\t". @unique ."\n";
44+
45+
}
46+
sub get_domains_info {
47+
my %h = ();
48+
my $f = io(@_);
49+
$f->autoclose(0);
50+
while(my $l = $f->getline || $f->getline){
51+
chomp $l;
52+
next if $l =~m/^#/;
53+
my @data = split /\s+/, $l;
54+
my ($target,$qry,$eval,$alnS,$alnE) = ($data[0],$data[3],$data[6],$data[19],$data[20]);
55+
@{$h{$qry}} = ($target,$eval,$alnS,$alnE);
56+
}
57+
return(%h);
58+
}
59+

scripts/generate_sample_yaml.pl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/data/ousmane.cisse/conda/bin/perl -w
2+
#
3+
#
4+
use IO::All;
5+
use feature 'say';
6+
use Carp;
7+
use strict;
8+
use Data::Dumper;
9+
10+
my $dir = io($ARGV[0]);
11+
my @contents = @$dir;
12+
13+
say "samples:";
14+
my $c = 1;
15+
16+
my $f = "";
17+
foreach $f (@contents){
18+
if ($f =~ m/\.fasta/){
19+
say " G$c: $f";
20+
$c++;
21+
}
22+
}

src/CMD.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
2+
## RUN HMMs
3+
## generate config.yaml for samples
4+
perl ../scripts/generate_sample_yaml.pl /data/pathogens/ > config_pathogens.yaml
5+
perl ../scripts/generate_sample_yaml.pl /data/healthy_contigs/ > config_healthy.yaml
6+
7+
# Snakemake is not working yet
8+
# do it manually
9+
10+
ls /data/pathogens/*.fasta | while read f; do \
11+
hmmsearch -E 1e-5 --cpu 8 --domtblout $f.tab \
12+
/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_nt.hmm \
13+
$f;done
14+
15+
ls /data/healthy_contigs/*.fasta | while read f; do \
16+
hmmsearch -E 1e-5 --cpu 8 --domtblout $f.tab \
17+
/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_nt.hmm \
18+
$f;done
19+

src/Snakefile

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
rule all:
2-
input:
3-
"../data/processed/VFDB/tmp.txt"
1+
configfile: "config_pathogens.yaml"
2+
3+
#rule all:
4+
# input:
5+
# "/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/SRR8247573.nt.tab"
6+
# "../data/processed/VFDB/HMM/tmp.txt"
47

58
rule get_genes:
69
input:
@@ -14,8 +17,8 @@ rule get_genes:
1417
setBnt="../data/processed/VFDB/setA_pro.fas.Selected_Genes_info.txt",
1518
setBprot="../data/processed/VFDB/setB_pro.fas.Selected_Genes_info.txt"
1619

17-
conda:
18-
"envs/meta.yaml"
20+
# conda:
21+
# "envs/meta.yaml"
1922

2023
threads: 1
2124

@@ -37,11 +40,35 @@ rule build_hmm:
3740
setBp="../data/raw/VFDB/VFDB_setB_pro.fas",
3841

3942
output:
40-
"../data/processed/VFDB/tmp.txt"
43+
"../data/processed/VFDB/HMM/tmp.txt"
4144

4245
run:
43-
shell("perl ../scripts/buil_hmm.pl {input.setAnt} {input.setAn}")
44-
shell("perl ../scripts/buil_hmm.pl {input.setAprot} {input.setAp}")
46+
#shell("echo > $output")
47+
# shell("perl ../scripts/buil_hmm.pl {input.setAnt} {input.setAn}")
48+
# shell("perl ../scripts/buil_hmm.pl {input.setAprot} {input.setAp}")
4549
shell("perl ../scripts/buil_hmm.pl {input.setBnt} {input.setBn}")
4650
shell("perl ../scripts/buil_hmm.pl {input.setBprot} {input.setBp}")
4751

52+
53+
54+
# randomly picked two for test
55+
rule test_hmm:
56+
input:
57+
hmmP="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_pro.hmm",
58+
hmmN="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_nt.hmm",
59+
fasta=expand("sample=config["samples"])
60+
# t1="/data/SRP170931/spades_contigs/SRR8247572.fasta",
61+
# t2="/data/SRP170931/spades_contigs/SRR8247573.fasta"
62+
output:
63+
temp("/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/pathogens/{sample}.tab")
64+
# t1="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/SRR8247572.nt.tab",
65+
# t2="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/SRR8247573.nt.tab"
66+
params:
67+
eval="1e-10"
68+
69+
threads: 8
70+
# run:
71+
# shell("hmmsearch -E {params.eval} --cpu {threads} -Z 21154 --domtblout {output.t1} {input.hmmN} {input.t1}")
72+
# shell("hmmsearch -E {params.eval} --cpu {threads} -Z 21154 --domtblout {output.t2} {input.hmmN} {input.t2}")
73+
shell:
74+
"hmmsearch -E {params.eval} --cpu {threads} --domtblout {output} {input.hmmN} {input.fasta}"

src/Snakefile_pathogens

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
configfile: "config_pathogens.yaml"
2+
3+
rule all:
4+
input:
5+
expand("/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/pathogens/{sample}.tab", sample=config["samples"])
6+
7+
rule testhmm:
8+
input:
9+
hmmP="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_pro.hmm",
10+
hmmN="/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/VFDB_setA_nt.hmm",
11+
fasta=expand("/data/pathogens/{sample}", sample=config["samples"])
12+
13+
output:
14+
"/home/ousmane.cisse/DATA/NCBI_hack/data/processed/VFDB/pathogens/{sample}.tab"
15+
16+
params:
17+
eval="1e-10"
18+
19+
threads: 8
20+
21+
shell:
22+
"hmmsearch -E {params.eval} --cpu {threads} --domtblout {output} {input.hmmN} {input.fasta}"

src/config_healthy.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
samples:
2+
G1: /data/healthy_contigs/ERR2983314.fasta
3+
G2: /data/healthy_contigs/ERR2983315.fasta
4+
G3: /data/healthy_contigs/ERR2983316.fasta
5+
G4: /data/healthy_contigs/ERR2983317.fasta
6+
G5: /data/healthy_contigs/ERR2983318.fasta
7+
G6: /data/healthy_contigs/ERR2983319.fasta
8+
G7: /data/healthy_contigs/ERR2983320.fasta
9+
G8: /data/healthy_contigs/ERR2983321.fasta
10+
G9: /data/healthy_contigs/ERR2983322.fasta
11+
G10: /data/healthy_contigs/ERR2983325.fasta
12+
G11: /data/healthy_contigs/ERR2983326.fasta
13+
G12: /data/healthy_contigs/ERR2983328.fasta
14+
G13: /data/healthy_contigs/ERR2983331.fasta

src/config_pathogens.yaml

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
samples:
2+
G1: /data/pathogens/acinetobacter_baumannii_acicu.fasta
3+
G2: /data/pathogens/aeromonas_hydrophila_ml09119.fasta
4+
G3: /data/pathogens/aeromonas_hydrophila_subsp_hydrophila_atcc_7966.fasta
5+
G4: /data/pathogens/aeromonas_salmonicida_subsp_salmonicida_a449.fasta
6+
G5: /data/pathogens/aeromonas_veronii_b565.fasta
7+
G6: /data/pathogens/anaplasma_phagocytophilum_hz.fasta
8+
G7: /data/pathogens/bacillus_anthracis.fasta
9+
G8: /data/pathogens/bacillus_anthracis_str_ames_ancestor.fasta
10+
G9: /data/pathogens/bacillus_anthracis_str_sterne.fasta
11+
G10: /data/pathogens/bacillus_cereus_atcc_10987.fasta
12+
G11: /data/pathogens/bacillus_cereus_atcc_14579.fasta
13+
G12: /data/pathogens/bacillus_subtilis_subsp_subtilis_str_168.fasta
14+
G13: /data/pathogens/bartonella_henselae_str_houston1.fasta
15+
G14: /data/pathogens/bartonella_quintana_str_toulouse.fasta
16+
G15: /data/pathogens/bordetella_pertussis_tohama_i.fasta
17+
G16: /data/pathogens/brucella_melitensis_bv_1_str_16m.fasta
18+
G17: /data/pathogens/brucella_suis_1330.fasta
19+
G18: /data/pathogens/burkholderia_pseudomallei_k96243.fasta
20+
G19: /data/pathogens/campylobacter_jejuni_subsp_jejuni_81176.fasta
21+
G20: /data/pathogens/campylobacter_jejuni_subsp_jejuni_nctc_11168.fasta
22+
G21: /data/pathogens/chlamydia_trachomatis_duw3cx.fasta
23+
G22: /data/pathogens/clostridium_botulinum_d_str_1873.fasta
24+
G23: /data/pathogens/clostridium_difficile_630.fasta
25+
G24: /data/pathogens/clostridium_perfringens_atcc_13124.fasta
26+
G25: /data/pathogens/clostridium_perfringens_b_str_atcc_3626.fasta
27+
G26: /data/pathogens/clostridium_perfringens_sm101.fasta
28+
G27: /data/pathogens/clostridium_perfringens_str_13.fasta
29+
G28: /data/pathogens/clostridium_perfringens_str_nctc_8533b4d.fasta
30+
G29: /data/pathogens/clostridium_tetani_e88.fasta
31+
G30: /data/pathogens/corynebacterium_diphtheriae_nctc_13129.fasta
32+
G31: /data/pathogens/coxiella_burnetii_cbugq212.fasta
33+
G32: /data/pathogens/coxiella_burnetii_cbukq154.fasta
34+
G33: /data/pathogens/coxiella_burnetii_dugway_5j108111.fasta
35+
G34: /data/pathogens/coxiella_burnetii_rsa_331.fasta
36+
G35: /data/pathogens/coxiella_burnetii_rsa_493.fasta
37+
G36: /data/pathogens/enterococcus_faecalis_v583.fasta
38+
G37: /data/pathogens/enterococcus_faecium_do.fasta
39+
G38: /data/pathogens/escherichia_coli.fasta
40+
G39: /data/pathogens/escherichia_coli_55989.fasta
41+
G40: /data/pathogens/escherichia_coli_b171.fasta
42+
G41: /data/pathogens/escherichia_coli_c34262.fasta
43+
G42: /data/pathogens/escherichia_coli_cft073.fasta
44+
G43: /data/pathogens/escherichia_coli_se11.fasta
45+
G44: /data/pathogens/escherichia_coli_str_042.fasta
46+
G45: /data/pathogens/escherichia_coli_str_111kh86.fasta
47+
G46: /data/pathogens/escherichia_coli_uti89.fasta
48+
G47: /data/pathogens/haemophilus_influenzae_rd_kw20.fasta
49+
G48: /data/pathogens/haemophilus_influenzae_tn106.fasta
50+
G49: /data/pathogens/helicobacter_pylori_26695.fasta
51+
G50: /data/pathogens/helicobacter_pylori_j99.fasta
52+
G51: /data/pathogens/klebsiella_pneumoniae_subsp_pneumoniae_1084.fasta
53+
G52: /data/pathogens/klebsiella_pneumoniae_subsp_pneumoniae_hs11286.fasta
54+
G53: /data/pathogens/klebsiella_pneumoniae_subsp_pneumoniae_ntuhk2044.fasta
55+
G54: /data/pathogens/legionella_pneumophila_subsp_pneumophila_str_philadelphia_1.fasta
56+
G55: /data/pathogens/listeria_innocua_slcc6294.fasta
57+
G56: /data/pathogens/listeria_monocytogenes_egde.fasta
58+
G57: /data/pathogens/mycobacterium_tuberculosis_h37rv.fasta
59+
G58: /data/pathogens/mycoplasma_hyopneumoniae_232.fasta
60+
G59: /data/pathogens/mycoplasma_pneumoniae_m129.fasta
61+
G60: /data/pathogens/neisseria_meningitidis_mc58.fasta
62+
G61: /data/pathogens/neisseria_meningitidis_z2491.fasta
63+
G62: /data/pathogens/pathogens.fasta
64+
G63: /data/pathogens/pseudomonas_aeruginosa_pa103.fasta
65+
G64: /data/pathogens/pseudomonas_aeruginosa_pao1.fasta
66+
G65: /data/pathogens/rickettsia_conorii_str_malish_7.fasta
67+
G66: /data/pathogens/salmonella_enterica_serovar_typhimurium.fasta
68+
G67: /data/pathogens/salmonella_enterica_subsp_enterica_serovar_typhi_str_ct18.fasta
69+
G68: /data/pathogens/salmonella_enterica_subsp_enterica_serovar_typhimurium_str_14028s.fasta
70+
G69: /data/pathogens/salmonella_enterica_subsp_enterica_serovar_typhimurium_str_lt2.fasta
71+
G70: /data/pathogens/shigella_dysenteriae_sd197.fasta
72+
G71: /data/pathogens/shigella_flexneri_2a_str_301.fasta
73+
G72: /data/pathogens/staphylococcus_aureus.fasta
74+
G73: /data/pathogens/staphylococcus_aureus_rn4220.fasta
75+
G74: /data/pathogens/staphylococcus_aureus_subsp_aureus_col.fasta
76+
G75: /data/pathogens/staphylococcus_aureus_subsp_aureus_mw2.fasta
77+
G76: /data/pathogens/staphylococcus_aureus_subsp_aureus_n315.fasta
78+
G77: /data/pathogens/staphylococcus_aureus_subsp_aureus_str_newman.fasta
79+
G78: /data/pathogens/streptococcus_agalactiae_2603vr.fasta
80+
G79: /data/pathogens/streptococcus_agalactiae_a909.fasta
81+
G80: /data/pathogens/streptococcus_agalactiae_fm027022.fasta
82+
G81: /data/pathogens/streptococcus_agalactiae_nem316.fasta
83+
G82: /data/pathogens/streptococcus_pneumoniae_r6.fasta
84+
G83: /data/pathogens/streptococcus_pneumoniae_taiwan19f14.fasta
85+
G84: /data/pathogens/streptococcus_pneumoniae_tigr4.fasta
86+
G85: /data/pathogens/streptococcus_pyogenes_m1_gas.fasta
87+
G86: /data/pathogens/streptococcus_pyogenes_mgas315.fasta
88+
G87: /data/pathogens/streptococcus_pyogenes_mgas5005.fasta
89+
G88: /data/pathogens/streptococcus_pyogenes_mgas8232.fasta
90+
G89: /data/pathogens/vibrio_cholerae_o1_biovar_el_tor_str_n16961.fasta
91+
G90: /data/pathogens/vibrio_parahaemolyticus.fasta
92+
G91: /data/pathogens/vibrio_parahaemolyticus_rimd_2210633.fasta
93+
G92: /data/pathogens/vibrio_vulnificus_cmcp6.fasta
94+
G93: /data/pathogens/vibrio_vulnificus_yj016.fasta
95+
G94: /data/pathogens/yersinia_enterocolitica_subsp_enterocolitica_8081.fasta
96+
G95: /data/pathogens/yersinia_enterocolitica_w1024.fasta
97+
G96: /data/pathogens/yersinia_pestis_co92.fasta

0 commit comments

Comments
 (0)