Skip to content

Commit a5a1f7f

Browse files
jwcodee and lcoombe authored
Log info and seed design improvement (#131)
* Update location of error message
* Correct spacing
* Use different shared mem location
* Make tmp dir
* Try newer python
* Test pinning dependency
* Move pinned version
* goldrush_path: expand reads skipped log
* goldrush_path: change variable name typo
* goldrush: change prefix
* spaced_seed: deterministic random space seed
* goldrush: only use seed preset if k and w are default
* goldrush: remove tab indent
* Update goldpolish to latest commit
* Remove debugging statements in azure-pipelines.yml and goldrush_test_demo.sh
* Update prefix in help docs
* goldrush: reverse seed preset logic regarding default k and w
* goldrush: remove s from help
* readme: update help to reflect goldrush
* Update p in goldrush help

---------

Co-authored-by: lcoombe <lauren.e.coombe@gmail.com>
1 parent 12ab32b commit a5a1f7f

File tree

7 files changed

+47
-20
lines changed

7 files changed

+47
-20
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,12 @@ default 5 rounds)
6363
o occupancy of the miBF [0.1]
6464
x threshold for number of hits in miBF for a given frame to be considered assigned [10]
6565
h number of seed patterns to use [3]
66-
s spaced seed design [1011011110110111101101]
6766
m minimum read length [20000]
6867
M maximum number of silver paths to generate [5]
6968
r ratio of full genome in golden path [0.9]
7069
P minimum average phred score for each read [15]
7170
d remove reads with greater or equal than d difference between average phred quality of first half and second half of the read [5]
72-
p prefix to use for the output paths [w16_x10]
71+
p prefix to use for the output paths [goldrush_asm]
7372
7473
Tigmint-long options:
7574
span min number of spanning molecules [2]

azure-pipelines.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ jobs:
1414
displayName: Create Anaconda environment
1515
- script: |
1616
source activate goldrush_CI
17-
conda install --yes -c conda-forge mamba python=3.9
18-
mamba install --yes -c conda-forge -c bioconda compilers meson gperftools sdsl-lite boost-cpp sparsehash btllib libdivsufsort minimap2 tigmint ntlink miller
17+
conda install --yes -c conda-forge mamba python=3.10
18+
mamba install --yes -c conda-forge -c bioconda compilers meson gperftools sdsl-lite boost-cpp sparsehash btllib libdivsufsort minimap2 tigmint ntlink miller
1919
displayName: Install dependencies
2020
- script: |
2121
source activate goldrush_CI
@@ -47,8 +47,8 @@ jobs:
4747
displayName: Create Anaconda environment
4848
- script: |
4949
source activate goldrush_CI
50-
conda install --yes -c conda-forge mamba python=3.9
51-
mamba install --yes -c conda-forge -c bioconda compilers meson gperftools sdsl-lite boost-cpp sparsehash btllib libdivsufsort minimap2 tigmint ntlink miller
50+
conda install --yes -c conda-forge mamba python=3.10
51+
mamba install --yes -c conda-forge -c bioconda compilers meson gperftools sdsl-lite boost-cpp sparsehash btllib libdivsufsort minimap2 tigmint ntlink miller
5252
displayName: Install dependencies
5353
- script: |
5454
source activate goldrush_CI

bin/goldrush

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ h=3
6868
s=1011011110110111101101
6969
r=0.9
7070
M=5
71-
p=w$(w)_x$(x)
71+
p=goldrush_asm
7272
p1=$(p)_silver_path
7373
p2=$(p)_golden_path
7474
P=15
@@ -165,13 +165,12 @@ help:
165165
@echo " o occupancy of the miBF [$(o)]"
166166
@echo " x threshold for number of hits in miBF for a given frame to be considered assigned [$(x)]"
167167
@echo " h number of seed patterns to use [$(h)]"
168-
@echo " s spaced seed design [$(s)]"
169168
@echo " m minimum read length [$(m)]"
170169
@echo " M maximum number of silver paths to generate [$(M)]"
171170
@echo " r ratio of full genome in golden path [$(r)]"
172171
@echo " P minimum average phred score for each read [$(P)]"
173172
@echo " d remove reads with greater or equal than d difference between average phred quality of first half and second half of the read [$(d)]"
174-
@echo " p prefix to use for the output paths [w$(w)_x$(x)]"
173+
@echo " p prefix to use for the output paths [$(p)]"
175174
@echo ""
176175
@echo " Tigmint-long options:"
177176
@echo " span min number of spanning molecules [$(span)]"
@@ -231,14 +230,26 @@ endif
231230
goldrush-path: $(p2).fa check-G check-reads clean
232231

233232
$(p2).fa: $(p1)_all.fq
234-
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p2) -i $< -h $(h) -j $(t) -P $(P) -d $(d) -x$(x) -s $(s) -g $(G) -b $(b) -m 0
233+
ifneq ($(k), 22)
234+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p2) -i $< -h $(h) -j $(t) -P $(P) -d $(d) -x$(x) -g $(G) -b $(b) -m 0
235+
else ifneq ($(w), 16)
236+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p2) -i $< -h $(h) -j $(t) -P $(P) -d $(d) -x$(x) -g $(G) -b $(b) -m 0
237+
else
238+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p2) -i $< -h $(h) -j $(t) -P $(P) -d $(d) -x$(x) -s $(s) -g $(G) -b $(b) -m 0
239+
endif
235240
echo "Done GoldRush-Path! Golden path can be found in: $@"
236241

237242
$(p1)_all.fq: $(p1)_$(M).fq
238243
cat $(p1)_*.fq > $@
239244

240245
$(p1)_$(M).fq: $(long_reads)
241-
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p1) -i $< -h $(h) -j $(t) -x$(x) -P $(P) -d $(d) -s $(s) -g $(G) -b $(b) -r $(r) --silver_path -M $(M) -m $(m)
246+
ifneq ($(k), 22)
247+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p1) -i $< -h $(h) -j $(t) -x$(x) -P $(P) -d $(d) -g $(G) -b $(b) -r $(r) --silver_path -M $(M) -m $(m)
248+
else ifneq ($(w), 16)
249+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p1) -i $< -h $(h) -j $(t) -x$(x) -P $(P) -d $(d) -g $(G) -b $(b) -r $(r) --silver_path -M $(M) -m $(m)
250+
else
251+
$(time) goldrush-path -k $(k) -w $(w) -t $(tile) -u $(u) -a $(a) -o $(o) -p $(p1) -i $< -h $(h) -j $(t) -x$(x) -P $(P) -d $(d) -s $(s) -g $(G) -b $(b) -r $(r) --silver_path -M $(M) -m $(m)
252+
endif
242253

243254
%.racon-polished.fa: %.fa.$(long_reads).sam %.fa
244255
$(time) racon -u -t$(t) $(long_reads) $^ > $@

goldrush_path/goldrush_path.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,16 @@ fill_bit_vector(const std::string& input_file,
140140
}
141141
size_t num_reads = 0;
142142
size_t num_passed_reads = 0;
143-
size_t size_num_reads_skipped_by_phred = 0;
144-
size_t size_num_reads_skipped_by_delta = 0;
143+
size_t num_reads_skipped_by_phred = 0;
144+
size_t num_reads_skipped_by_delta = 0;
145+
size_t num_reads_skipped_by_length = 0;
145146
#pragma omp parallel
146147
for (const auto record : reader) {
147148
#pragma omp atomic
148149
++num_reads;
149150
if (record.seq.size() < min_seq_len) {
151+
#pragma omp atomic
152+
++num_reads_skipped_by_length;
150153
continue;
151154
}
152155
const auto phred_stat = calc_phred_average(record.qual);
@@ -155,11 +158,11 @@ fill_bit_vector(const std::string& input_file,
155158
if (opt::verbose) {
156159
if (phred_stat.first < opt::phred_min) {
157160
#pragma omp atomic
158-
++size_num_reads_skipped_by_phred;
161+
++num_reads_skipped_by_phred;
159162
}
160163
if (phred_stat.second >= opt::phred_delta) {
161164
#pragma omp atomic
162-
++size_num_reads_skipped_by_delta;
165+
++num_reads_skipped_by_delta;
163166
}
164167
}
165168
#pragma omp critical
@@ -183,9 +186,15 @@ fill_bit_vector(const std::string& input_file,
183186
<< floor((double)(num_reads - num_passed_reads) / num_reads)
184187
<< "\n"
185188
<< "num_reads_skipped_by_phred: "
186-
<< size_num_reads_skipped_by_phred << "\n"
189+
<< num_reads_skipped_by_phred << "\n"
187190
<< "num_reads_skipped_by_delta: "
188-
<< size_num_reads_skipped_by_delta << "\n"
191+
<< num_reads_skipped_by_delta << "\n"
192+
<< "num_reads_skipped_by_length: "
193+
<< num_reads_skipped_by_length << "\n"
194+
<< "Total reads skipped: "
195+
<< num_reads_skipped_by_phred +
196+
num_reads_skipped_by_delta +
197+
num_reads_skipped_by_length
189198
<< std::endl;
190199
}
191200

@@ -1083,6 +1092,14 @@ main(int argc, char** argv)
10831092
opt::min_length,
10841093
filter_out_reads);
10851094
}
1095+
1096+
}
1097+
if (opt::silver_path && opt::max_paths > curr_path) {
1098+
std::cerr << "WARNING: Expected " << std::to_string(opt::max_paths)
1099+
<< " silver paths, but only " << std::to_string(curr_path)
1100+
<<" generated.\n" << "Possible reasons include:\n"
1101+
<< "\t- Input reads sorted by chromosome/position\n"
1102+
<< "\t- Genome size set too large\n";
10861103
}
10871104

10881105
std::cerr << "assigned" << std::endl;

goldrush_path/spaced_seeds.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ make_seed_pattern(const std::string& seed_preset,
1616
std::string right_seed_str;
1717

1818
if (seed_preset == "") {
19-
srand(time(NULL));
19+
srand(123);
2020
// seed generation
2121
std::cerr << "Designing base symmetrical spaced seed"
2222
<< "\n"

tests/goldrush_test_demo.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ curl -L --output test_reads.fq https://www.bcgsc.ca/downloads/btl/goldrush/test/
77

88
# Run this demo to test your GoldRush installation
99
echo "Launching GoldRush"
10-
goldrush run reads=test_reads G=1e6 t=4 p=goldrush_test -B
10+
goldrush run reads=test_reads G=1e6 t=4 p=goldrush_test -B
1111

1212
l50=$(abyss-fac goldrush_test_golden_path.goldpolish-polished.span2.dist500.tigmint.fa.k40.w250.ntLink-5rounds.fa |awk '{print $3}' |tail -n1)
1313

0 commit comments

Comments
 (0)