-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProcess_simulations_documentation.txt
More file actions
151 lines (86 loc) · 5.06 KB
/
Process_simulations_documentation.txt
File metadata and controls
151 lines (86 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#######################################################
################ simulation processing ################
#######################################################
1) Add missing data to simulations
#go to aDNA_processData/ directory
# set correct __path__ and __new_filename__ directory in qsub_Process_aDNA script and then run script
qsub qsub_Process_aDNA
# do this for all the simulations you want to process (hard, soft neutral, different demographies etc.)
1.5) Make sure you have the correct number of segsites:
# Force every simulation file to report "segsites: 201".
# NOTE(review): this blindly overwrites whatever count the simulator wrote --
# confirm 201 is the intended window width for every model processed here.
for file in *; do
  [ -f "$file" ] || continue  # skip directories/specials; sed -i cannot edit them
  sed -i 's/segsites: [0-9]\+/segsites: 201/g' "$file"
done
#remove empty lines
# Strip blank lines from every simulation file in the current directory.
for file in *; do
  [ -f "$file" ] || continue  # skip non-regular files; sed -i would fail on them
  sed -i '/^$/d' "$file"
  #grep -c '^$' "$file"   # debug: count remaining empty lines (should be 0)
  #echo "$file"           # debug: show which file was processed
done
2) Rename each file in format Model_THETA_target.txt
# Rename raw simulation output into the Model_THETA_target naming scheme.
for idx in {1..50}; do
  src="MD_Neutral_Souilmi_Q1_${idx}.txt"
  dst="NeutralSouilmiMD05_THETA0.0_target0_${idx}.txt"
  mv "$src" "$dst"
  # Alternative renames for the other models (enable the one you need):
  #mv "MD_SoftSweep_Souilmi_Q1_${idx}.txt" "SoftSweepSouilmiTHETA5SelecLow_THETA5.0_target0_${idx}.txt"
  #mv "MD_HardSweep_Souilmi_Q1_${idx}.txt" "HardSweepSouilmiSelecLow_THETA0.01_target0_${idx}.txt"
  #mv "MD_SGV_ConstantNe_Q1_${idx}.txt" "SGVMD43_THETA0.01_target0_${idx}.txt"
done
# Correct the window-type tag in the file names: centerWindowDist -> centerWindowFreq.
for n in {1..50}; do
  mv "ConstantNeMD43_Neutral_THETA1to5_n150_w201_centerWindowDist_trainsrc_${n}.dat" \
     "ConstantNeMD43_Neutral_THETA1to5_n150_w201_centerWindowFreq_trainsrc_${n}.dat"
done
#check num of sims per file
# Report the number of simulations (lines containing "segsites") per file.
# grep -c replaces the grep | wc -l pipeline -- same count, one process.
for file in *; do
  [ -f "$file" ] || continue  # skip directories; grep would just error on them
  grep -c "segsites" "$file"
done
2.5) (Only run this if I have to merge data from files and re-organize) Merge data
# Merge the 100 simulation files into 20 files, 5 consecutive inputs each.
counter=1
for ((i = 1; i <= 100; i += 5)); do
  echo "$counter"
  # Build the 5-file group instead of spelling out each cat argument by hand.
  group=()
  for ((j = 0; j < 5; j++)); do
    group+=("SoftSweep_1e6_H0.5_THETAvary_Q50_$((i + j)).txt")
  done
  cat "${group[@]}" > "SoftSweep_1e6_H0.5_THETAvary_Q50_merged_${counter}.txt"
  # $((...)) form avoids the ((counter++)) exit-status pitfall under set -e.
  counter=$((counter + 1))
done
3) Once missing data has been added, process simulations and order haplotypes
# make sure your path directory doesn't include directories with _ (underscore) in file name
# go to directory /DANN/aDNA/ProcessingData
# Processed files will be saved in /DANN/aDNA/ProcessingData/ProcessedData -- this directory includes two sub directories:
# /DANN/aDNA/ProcessingData/ProcessedData/SortByRowDist/
# /DANN/aDNA/ProcessingData/ProcessedData/SortByRowFreq/
# Save the data in the corresponding directory depending on how we sort haplotypes -- each subdirectory contains directories for Hard Sweeps, Soft Sweeps, Neutral, etc.
# NOTE(review): steps 1 and 4 submit jobs via "qsub <script>" (e.g. "qsub qsub_Process_aDNA");
# confirm whether this should read "qsub qsub_processData" or the script is meant to run directly.
qsub_processData
4) Concatenate desired number of simulations into a .npy file to use as training input
qsub qsub_DataForTraining
# this runs the script data_for_training.py --> in this script you must define the directory of the already processed and sorted simulations as well as the name of the output .npy file.
# The code will generate three output files ex:
# neutral_ConstantNe_RowDist_n150_w201_sims.npy, HS_ConstantNe_RowDist_n150_w201_sims.npy and SS_ConstantNe_RowDist_n150_w201_sims.npy
#-----------------------------------------------------------------------------------------------------------------------------------------
qsub_GenerateData # for aDNA data
# qsub_GenerateData runs GenerateTrainingData.py
qsub_GenerateData_modernMD # for modern human data or Drosophila data
## qsub_GenerateData_modernMD runs GenerateTestData_Drosophila.py or GenerateTrainingData_modern.py
#get position data from simulations
# Pull the "positions:" line out of each raw simulation into its own POS_ file.
for k in {1..50}; do
  infile="../SoftSweep_1e4_THETAvary_Q1_450Kb_${k}.txt"
  grep "positions:" "$infile" > "POS_SoftSweep_1e4_THETAvary_Q1_450Kb_${k}.txt"
done
# Fix the window-type tag in the Souilmi missing-data file names (Dist -> Freq).
for n in {1..50}; do
  mv "SouilmiMiss1iMD05_Neutral_THETA1to5_n150_w201_centerWindowDist_trainsrc_${n}.dat" \
     "SouilmiMiss1iMD05_Neutral_THETA1to5_n150_w201_centerWindowFreq_trainsrc_${n}.dat"
done
# get num origins and theta
# Print the line two lines BEFORE each "REACHED PF -- outputting tree" marker
# (prev2 trails the stream by two lines, prev by one).
awk '{if ($0 ~ /REACHED PF -- outputting tree/) print prev2; prev2=prev; prev=$0}' joblog.8127162.1 > num_origins_sample.txt_1
# Print the line immediately before each marker.
awk '{if ($0 ~ /REACHED PF -- outputting tree/) print prev; prev=$0}' joblog.8127162.1 > num_origins.txt_1
# Print the single line FOLLOWING each "RECHED END OF SIMULATION" marker.
# NOTE(review): the pattern reads "RECHED" (not "REACHED") -- confirm this matches
# the simulator's actual log text and is not a typo that silently matches nothing.
awk '/RECHED END OF SIMULATION/{found=1; next} found {print; found=0}' joblog.8042099.50 > num_origins.txt_2
# Drop restart notices that were captured as the "following line".
sed -i '/SIMULATION END REACHED NO SWEEP -- RESTARTING/d' num_origins.txt_2
cat num_origins.txt_1 num_origins.txt_2 > num_origins.txt
# Copy PopGenStats files to the shared ProcessedData area, skipping files that
# already exist at the destination.
# Bug in the original: the existence test used "dest/{}" where {} still carries
# the "PopGenStats/" prefix (so it checked dest/PopGenStats/<file> while cp
# writes dest/<file>, meaning the skip could never fire), and {} embedded in a
# longer argument is unspecified behavior in POSIX find. Compare basenames via
# an explicit sh -c loop instead.
find PopGenStats/ -type f -exec sh -c '
  dest=/u/project/ngarud/Garud_lab/DANN/aDNA/ProcessingData/ProcessedData/PopGenStats
  for f do
    [ -e "$dest/$(basename -- "$f")" ] || cp -- "$f" "$dest/"
  done
' sh {} +
# Keep everything from the 9000th "//" record separator onward
# (the line holding the 9000th "//" itself is included).
awk '/\/\// { n++ } n >= 9000' Neutral_Souilmi_Q1_20.txt > Neutral_Souilmi_Q1_20.txt_pt2
# Peek at the first lines from the 9528th "//" record separator onward.
awk '/\/\// { n++ } n >= 9528' Neutral_Souilmi_Q1_9.txt | head
# Repair records where a trailing "0" got glued to the "//" separator:
# insert a newline so every "//" starts its own line again.
sed -i 's|0//|0\n//|g' Neutral_Souilmi_Q1_9.txt
# Sanity-check the simulation count ("segsites" headers) in files 41-50.
for i in {41..50}; do
  echo "$i"
  # grep -c gives the match count directly (replaces the grep | wc -l pipeline)
  grep -c "segsites" "Neutral_Souilmi_Q1_${i}.txt"
done