-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProcess_simulations_documentation.txt
More file actions
151 lines (86 loc) · 5.06 KB
/
Process_simulations_documentation.txt
File metadata and controls
151 lines (86 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#######################################################
################ simulation processing ################
#######################################################
1) Add missing data to simulations
#go to aDNA_processData/ directory
# set correct __path__ and __new_filename__ directory in qsub_Process_aDNA script and then run script
qsub qsub_Process_aDNA
# do this for all the simulations you want to process (hard, soft neutral, different demographies etc.)
1.5) Make sure you have the correct number of segsites:
# Force every simulation file to report "segsites: 201".
# NOTE(review): this blindly overwrites whatever count the simulator wrote --
# confirm 201 is the intended window width for every model processed here.
for file in *; do
  [ -f "$file" ] || continue  # skip directories/specials; sed -i cannot edit them
  sed -i 's/segsites: [0-9]\+/segsites: 201/g' "$file"
done
#remove empty lines
# Strip blank lines from every simulation file in the current directory.
for file in *; do
  [ -f "$file" ] || continue  # skip non-regular files; sed -i would fail on them
  sed -i '/^$/d' "$file"
  #grep -c '^$' "$file"   # debug: count remaining empty lines (should be 0)
  #echo "$file"           # debug: show which file was processed
done
2) Rename each file in format Model_THETA_target.txt
# Rename raw simulation output into the Model_THETA_target naming scheme.
for idx in {1..50}; do
  src="MD_Neutral_Souilmi_Q1_${idx}.txt"
  dst="NeutralSouilmiMD05_THETA0.0_target0_${idx}.txt"
  mv "$src" "$dst"
  # Alternative renames for the other models (enable the one you need):
  #mv "MD_SoftSweep_Souilmi_Q1_${idx}.txt" "SoftSweepSouilmiTHETA5SelecLow_THETA5.0_target0_${idx}.txt"
  #mv "MD_HardSweep_Souilmi_Q1_${idx}.txt" "HardSweepSouilmiSelecLow_THETA0.01_target0_${idx}.txt"
  #mv "MD_SGV_ConstantNe_Q1_${idx}.txt" "SGVMD43_THETA0.01_target0_${idx}.txt"
done
# Correct the window-type tag in the file names: centerWindowDist -> centerWindowFreq.
for n in {1..50}; do
  mv "ConstantNeMD43_Neutral_THETA1to5_n150_w201_centerWindowDist_trainsrc_${n}.dat" \
     "ConstantNeMD43_Neutral_THETA1to5_n150_w201_centerWindowFreq_trainsrc_${n}.dat"
done
#check num of sims per file
# Report the number of simulations (lines containing "segsites") per file.
# grep -c replaces the grep | wc -l pipeline -- same count, one process.
for file in *; do
  [ -f "$file" ] || continue  # skip directories; grep would just error on them
  grep -c "segsites" "$file"
done
2.5) (Only run this if I have to merge data from files and re-organize) Merge data
# Merge the 100 simulation files into 20 files, 5 consecutive inputs each.
counter=1
for ((i = 1; i <= 100; i += 5)); do
  echo "$counter"
  # Build the 5-file group instead of spelling out each cat argument by hand.
  group=()
  for ((j = 0; j < 5; j++)); do
    group+=("SoftSweep_1e6_H0.5_THETAvary_Q50_$((i + j)).txt")
  done
  cat "${group[@]}" > "SoftSweep_1e6_H0.5_THETAvary_Q50_merged_${counter}.txt"
  # $((...)) form avoids the ((counter++)) exit-status pitfall under set -e.
  counter=$((counter + 1))
done
3) Once missing data has been added, process simulations and order haplotypes
# make sure your path directory doesn't include directories with _ (underscore) in file name
# go to directory /DANN/aDNA/ProcessingData
# Processed files will be saved in /DANN/aDNA/ProcessingData/ProcessedData -- this directory includes two sub directories:
# /DANN/aDNA/ProcessingData/ProcessedData/SortByRowDist/
# /DANN/aDNA/ProcessingData/ProcessedData/SortByRowFreq/
# Save the data in the corresponding directory depending on how we sort haplotypes -- each subdirectory contains directories for Hard Sweeps, Soft Sweeps, Neutral, etc.
# NOTE(review): steps 1 and 4 submit jobs via "qsub <script>" (e.g. "qsub qsub_Process_aDNA");
# confirm whether this should read "qsub qsub_processData" or the script is meant to run directly.
qsub_processData
4) Concatenate desired number of simulations into a .npy file to use as training input
qsub qsub_DataForTraining
# this runs the script data_for_training.py --> in this script you must define the directory of the already processed and sorted simulations as well as the name of the output .npy file.
# The code will generate three output files ex:
# neutral_ConstantNe_RowDist_n150_w201_sims.npy, HS_ConstantNe_RowDist_n150_w201_sims.npy and SS_ConstantNe_RowDist_n150_w201_sims.npy
#-----------------------------------------------------------------------------------------------------------------------------------------
qsub_GenerateData # for aDNA data
# qsub_GenerateData runs GenerateTrainingData.py
qsub_GenerateData_modernMD # for modern human data or Drosophila data
## qsub_GenerateData_modernMD runs GenerateTestData_Drosophila.py or GenerateTrainingData_modern.py
#get position data from simulations
# Pull the "positions:" line out of each raw simulation into its own POS_ file.
for k in {1..50}; do
  infile="../SoftSweep_1e4_THETAvary_Q1_450Kb_${k}.txt"
  grep "positions:" "$infile" > "POS_SoftSweep_1e4_THETAvary_Q1_450Kb_${k}.txt"
done
# Fix the window-type tag in the Souilmi missing-data file names (Dist -> Freq).
for n in {1..50}; do
  mv "SouilmiMiss1iMD05_Neutral_THETA1to5_n150_w201_centerWindowDist_trainsrc_${n}.dat" \
     "SouilmiMiss1iMD05_Neutral_THETA1to5_n150_w201_centerWindowFreq_trainsrc_${n}.dat"
done
# get num origins and theta
# Print the line two lines BEFORE each "REACHED PF -- outputting tree" marker
# (prev2 trails the stream by two lines, prev by one).
awk '{if ($0 ~ /REACHED PF -- outputting tree/) print prev2; prev2=prev; prev=$0}' joblog.8127162.1 > num_origins_sample.txt_1
# Print the line immediately before each marker.
awk '{if ($0 ~ /REACHED PF -- outputting tree/) print prev; prev=$0}' joblog.8127162.1 > num_origins.txt_1
# Print the single line FOLLOWING each "RECHED END OF SIMULATION" marker.
# NOTE(review): the pattern reads "RECHED" (not "REACHED") -- confirm this matches
# the simulator's actual log text and is not a typo that silently matches nothing.
awk '/RECHED END OF SIMULATION/{found=1; next} found {print; found=0}' joblog.8042099.50 > num_origins.txt_2
# Drop restart notices that were captured as the "following line".
sed -i '/SIMULATION END REACHED NO SWEEP -- RESTARTING/d' num_origins.txt_2
cat num_origins.txt_1 num_origins.txt_2 > num_origins.txt
# Copy PopGenStats files to the shared ProcessedData area, skipping files that
# already exist at the destination.
# Bug in the original: the existence test used "dest/{}" where {} still carries
# the "PopGenStats/" prefix (so it checked dest/PopGenStats/<file> while cp
# writes dest/<file>, meaning the skip could never fire), and {} embedded in a
# longer argument is unspecified behavior in POSIX find. Compare basenames via
# an explicit sh -c loop instead.
find PopGenStats/ -type f -exec sh -c '
  dest=/u/project/ngarud/Garud_lab/DANN/aDNA/ProcessingData/ProcessedData/PopGenStats
  for f do
    [ -e "$dest/$(basename -- "$f")" ] || cp -- "$f" "$dest/"
  done
' sh {} +
# Keep everything from the 9000th "//" record separator onward
# (the line holding the 9000th "//" itself is included).
awk '/\/\// { n++ } n >= 9000' Neutral_Souilmi_Q1_20.txt > Neutral_Souilmi_Q1_20.txt_pt2
# Peek at the first lines from the 9528th "//" record separator onward.
awk '/\/\// { n++ } n >= 9528' Neutral_Souilmi_Q1_9.txt | head
# Repair records where a trailing "0" got glued to the "//" separator:
# insert a newline so every "//" starts its own line again.
sed -i 's|0//|0\n//|g' Neutral_Souilmi_Q1_9.txt
# Sanity-check the simulation count ("segsites" headers) in files 41-50.
for i in {41..50}; do
  echo "$i"
  # grep -c gives the match count directly (replaces the grep | wc -l pipeline)
  grep -c "segsites" "Neutral_Souilmi_Q1_${i}.txt"
done