Skip to content

Commit 680614e

Browse files
committed
fixed a couple of bugs described in CHANGELOG; bumped version
1 parent 64f06d9 commit 680614e

File tree

4 files changed

+96
-42
lines changed

4 files changed

+96
-42
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# PFP-Doc Change Log
22

3-
## v1.0.3 - latest
3+
## v1.0.4 - latest
4+
- Turned off second method for lcp queue trimming to avoid errors
5+
- Fixed the non-heuristic method for lcp queue trimming by updating the ch_doc_counters table
6+
during the loop
7+
- Updated the code that updates profile with LF steps to avoid overflow
8+
9+
## v1.0.3
410
- Updated the doc_queries constructor to avoid creating a separate table of sequential entries
511
to limit the RAM usage, especially when working with large datasets.
612

include/doc_queries.hpp

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
8383
size_t log_n = bitsize(uint64_t(this->bwt.size()));
8484

8585
FORCE_LOG("build_profiles", "bwt statistics: n = %ld, r = %ld\n" , this->bwt.size(), this->r);
86+
87+
88+
// for (size_t i = 0; i < n; i++)
89+
// std::cout << "i = " << i << " bwt[i] = " << this->bwt[i] << std::endl;
8690

8791
// Determine the number of documents and verify that the file
8892
// sizes are correct.
@@ -230,27 +234,21 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
230234
found_one = true;
231235
}
232236
}
237+
233238
if (found_one)
234239
output_str = "{" + output_str.substr(2) + "} ";
235240
else {
236241
output_str = "{} ";
237-
//std::cout << "length = " << length << std::endl;
238-
// for (size_t i = 0; i < profile.size();i++)
239-
// std::cout << profile[i] << " ";
240-
// std::cout << "\n";
241-
//std::exit(1);
242-
}
243-
// for (size_t i = 0; i < profile.size();i++)
244-
// std::cout << profile[i] << " ";
245-
// std::cout << "\n";
242+
}
246243
listings_fd << output_str;
247244
};
248245

249246
// Process each read, and print out the document lists
250247
while (kseq_read(seq)>=0) {
251248

252249
// Uppercase every character in read
253-
for (size_t i = 0; i < seq->seq.l; ++i) {
250+
for (size_t i = 0; i < seq->seq.l; ++i)
251+
{
254252
seq->seq.s[i] = static_cast<char>(std::toupper(seq->seq.s[i]));
255253
}
256254

@@ -267,8 +265,6 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
267265
// Tell us what type of profile to grab based on pointer variables
268266
bool use_start = false, use_end = false;
269267

270-
//std::cout << "\n";
271-
272268
// Perform backward search and report document listings when
273269
// range goes empty or we reach the end
274270
for (int i = (seq->seq.l-1); i >= 0; i--) {
@@ -292,10 +288,9 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
292288
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
293289
else
294290
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
295-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
291+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x = std::min((size_t) MAXLCPVALUE, x+num_LF_steps);});
296292

297293
listings_fd << "[" << (i+1) << "," << end_pos_of_match << "] ";
298-
//std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
299294

300295
length = std::min((size_t) MAXLCPVALUE, (end_pos_of_match-i));
301296
process_profile(curr_profile, length);
@@ -313,6 +308,7 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
313308
num_LF_steps = 0;
314309
use_start = false; use_end = false;
315310

311+
// DEBUG
316312
//std::cout << "case 1: next_ch = " << next_ch << std::endl;
317313

318314
// If the start position run is the same as query
@@ -322,6 +318,20 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
322318
use_end = true;
323319
else
324320
use_start = true;
321+
322+
323+
// DEBUG:
324+
// if (use_end) {
325+
// for (auto x: end_doc_profiles[curr_prof_ch][curr_prof_pos])
326+
// std::cout << x << " ";
327+
// std::cout << "\n";
328+
// }
329+
// else {
330+
// for (auto x: start_doc_profiles[curr_prof_ch][curr_prof_pos])
331+
// std::cout << x << " ";
332+
// std::cout << "\n";
333+
// }
334+
325335
}
326336
// range is within BWT run, but wrong character
327337
else if (this->bwt[start] != next_ch)
@@ -331,10 +341,9 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
331341
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
332342
else
333343
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
334-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
344+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x = std::min((size_t) MAXLCPVALUE, x+num_LF_steps);});
335345

336346
listings_fd << "[" << (i+1) << "," << end_pos_of_match << "] ";
337-
//std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
338347

339348
length = std::min((size_t) MAXLCPVALUE, (end_pos_of_match-i));
340349
process_profile(curr_profile, length);
@@ -365,9 +374,7 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
365374
else
366375
{
367376
num_LF_steps++;
368-
//std::cout << "case 3" << std::endl;
369-
//std::transform(curr_profile.begin(), curr_profile.end(), curr_profile.begin(),
370-
// [](size_t x) { return (++x); });
377+
//std::cout << "case 3" << std::endl;
371378
}
372379

373380
// Perform an LF step
@@ -379,12 +386,35 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
379386
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
380387
else
381388
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
382-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
383389

384-
listings_fd << "[" << 0 << "," << end_pos_of_match << "] ";
385-
//std::cout << "[" << 0 << "," << end_pos_of_match << "] " << std::endl;
390+
// DEBUG:
391+
// if (use_end) {
392+
// for (auto x: curr_profile)
393+
// std::cout << x << " ";
394+
// std::cout << "\n";
395+
// }
396+
// else {
397+
// for (auto x: curr_profile)
398+
// std::cout << x << " ";
399+
// std::cout << "\n";
400+
// }
401+
402+
// Update profile based on LF steps
403+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x = std::min((size_t) MAXLCPVALUE, x+num_LF_steps);});
404+
405+
// DEBUG:
406+
// if (use_end) {
407+
// for (auto x: curr_profile)
408+
// std::cout << x << " ";
409+
// std::cout << "\n";
410+
// }
411+
// else {
412+
// for (auto x: curr_profile)
413+
// std::cout << x << " ";
414+
// std::cout << "\n";
415+
// }
386416

387-
//std::cout << num_LF_steps << std::endl;
417+
listings_fd << "[" << 0 << "," << end_pos_of_match << "] ";
388418
length = std::min((size_t) MAXLCPVALUE, end_pos_of_match+1);
389419
process_profile(curr_profile, length);
390420
listings_fd << "\n";

include/pfp_doc.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
std::fprintf(stderr, "\n");} while (0)
2929

3030
// Definitions
31-
#define PFPDOC_VERSION "1.0.3"
31+
#define PFPDOC_VERSION "1.0.4"
3232

3333
#define DOCWIDTH 2 // 5
3434
#define MAXQUEUELENGTH 1000000

include/pfp_lcp_doc.hpp

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -504,14 +504,17 @@ class pfp_lcp{
504504
std::vector<size_t> curr_da_profile (ref_build->num_docs, 0);
505505
std::vector<bool> docs_to_collect (ref_build->num_docs, false);
506506

507-
// DEBUGGING
507+
// DEBUG: variables defined for debugging
508508
std::vector<double> times = {0, 0, 0, 0, 0, 0};
509509
std::vector<size_t> method_wins = {0, 0, 0};
510510
double avg_queue_length = 0.0;
511511
size_t total_pos_traversed = 0;
512512
auto start = std::chrono::system_clock::now();
513513
auto sec = std::chrono::duration<double>(std::chrono::system_clock::now() - start).count();
514514

515+
// DEBUG: output file for storing variables
516+
//std::ofstream debugging_fd (filename + ".output_data.txt");
517+
515518
// Create a backup predecessor max lcp table, and re-initialize with max_lcp
516519
size_t num_blocks_of_32 = num_docs/32;
517520
num_blocks_of_32++;
@@ -591,9 +594,7 @@ class pfp_lcp{
591594
ssa = (pf.pos_T[*curr_occ.first] - curr.suffix_length) % (pf.n - pf.w + 1ULL);
592595
esa = (pf.pos_T[*curr_occ.first] - curr.suffix_length) % (pf.n - pf.w + 1ULL);
593596

594-
595597
/* Start of the DA Profiles code */
596-
597598
uint8_t curr_bwt_ch = curr_occ.second.second;
598599
size_t lcp_i = lcp_suffix;
599600
size_t sa_i = ssa;
@@ -612,6 +613,9 @@ class pfp_lcp{
612613
size_t pos_of_LF_i = (sa_i > 0) ? (sa_i - 1) : (ref_build->total_length-1);
613614
size_t doc_of_LF_i = ref_build->doc_ends_rank(pos_of_LF_i);
614615

616+
// DEBUG: creates the debugging file
617+
// debugging_fd << pos << std::setw(20) << (curr_run_num-1) << std::setw(20) << curr_bwt_ch << std::setw(20) << sa_i << std::setw(20) << doc_of_LF_i << std::setw(20) << lcp_i << std::endl;
618+
615619
// Add the current suffix data to LCP queue
616620
queue_entry_t curr_entry = {curr_run_num-1, curr_bwt_ch, doc_of_LF_i, is_start, is_end, lcp_i};
617621
lcp_queue.push_back(curr_entry);
@@ -644,7 +648,7 @@ class pfp_lcp{
644648

645649
// Update the predecessor max lcp structure with the current lcp
646650
// so basically iterate through all values and take the min
647-
// FYI: this is one of the time-consuming part of the construction
651+
// IMPORTANT: this is the old code before SIMDifying.
648652

649653
// for (size_t ch_num = 0; ch_num < 256; ch_num++) {
650654
// for (size_t doc_num = 0; doc_num < num_docs; doc_num++) {
@@ -706,7 +710,7 @@ class pfp_lcp{
706710

707711
/* End of SIMD changes */
708712

709-
// START of check !!!!
713+
// CHECK CODE: makes sure the predecessor table is correct
710714

711715
// for (size_t i = 0; i < 256; i++) {
712716
// for (size_t j = 0; j < num_docs; j++) {
@@ -722,8 +726,7 @@ class pfp_lcp{
722726
// }
723727
// std::cout << "we passed the test!" << std::endl;
724728

725-
726-
// END of check !!!!
729+
// END OF CHECK
727730

728731

729732
// Initialize the curr_da_profile with max lcp for predecessor
@@ -817,36 +820,51 @@ class pfp_lcp{
817820
// Method #2: heuristic since we remove all entries above a small lcp value (7)
818821
size_t curr_pos = 0;
819822
size_t records_to_remove_method1 = 0, records_to_remove_method2 = 0;
820-
bool method1_done = false, method2_done = false;
821-
if (pos % 10 == 0) {
823+
bool method1_done = false, method2_done = true; // Turned off Method #2 for now
824+
//if (pos % 10 == 0) {
822825
while (curr_pos < lcp_queue.size() && (!method1_done || !method2_done)) {
823826
uint8_t curr_ch = lcp_queue[curr_pos].bwt_ch;
824827
size_t curr_doc = lcp_queue[curr_pos].doc_num;
825828
assert(ch_doc_counters[curr_ch][curr_doc] >= 1);
826829

830+
size_t current_run_num = lcp_queue[curr_pos].run_num;
831+
827832
if (!method1_done) {
833+
828834
size_t count = 0;
829835
for (size_t i = 0; i < num_docs; i++) {
830836
if (i != curr_doc && ch_doc_counters[curr_ch][i] >= 1)
831837
count++;
832838
}
833-
834-
if (count == (num_docs-1))
839+
/*
840+
* IMPORTANT: I added the decrement statements below since we need
841+
* an updated ch_doc_counters in order to make correct decisions. The
842+
* problem was that sometimes it would remove too many entries
843+
*/
844+
if (count == (num_docs-1)) {
835845
records_to_remove_method1++;
846+
ch_doc_counters[curr_ch][curr_doc] -= 1;
836847
// TODO: generalize this to take into account characters that only occur once
837-
else if (curr_ch == EndOfDict || (curr_ch != 'A' && curr_ch != 'C'
838-
&& curr_ch != 'G' && curr_ch != 'T' && curr_ch != 'U'))
848+
} else if (curr_ch == EndOfDict || (curr_ch != 'A' && curr_ch != 'C'
849+
&& curr_ch != 'G' && curr_ch != 'T' && curr_ch != 'U')) {
839850
records_to_remove_method1++;
840-
else
851+
ch_doc_counters[curr_ch][curr_doc] -= 1;
852+
} else
841853
method1_done = true;
842854
}
843855
if (!method2_done) {
844-
if (lcp_queue[records_to_remove_method2++].lcp_with_prev_suffix <= 5)
856+
if (lcp_queue[records_to_remove_method2++].lcp_with_prev_suffix <= 8)
845857
method2_done = true;
846858
}
847859
curr_pos++;
848860
}
849-
}
861+
// }
862+
863+
// Added this check to avoid removing all entries when there are no small
864+
// values in the queue. TODO: fix this so the loop above understands this
865+
// case
866+
if (records_to_remove_method2 == lcp_queue.size())
867+
records_to_remove_method2 = 0;
850868

851869
// Take the maximum value from the two methods above, BUT
852870
// we cannot reduce the queue to empty because we need to
@@ -1326,7 +1344,7 @@ class pfp_lcp{
13261344
bool is_end = lcp_queue[0].is_end;
13271345

13281346
// Update <ch, doc> count matrix
1329-
ch_doc_counters[curr_ch][curr_doc] -= 1;
1347+
//ch_doc_counters[curr_ch][curr_doc] -= 1;
13301348

13311349
// Update the queue position lists
13321350
num_records_ejected++;

0 commit comments

Comments
 (0)