@@ -28,21 +28,21 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
2828 // This vector has the following dimensions: [256][num of ith char][num_docs]
2929 // This structure stores the DA profiles for each
3030 // character separately.
31- std::vector<std::vector<std::vector<size_t >>> start_doc_profiles;
32- std::vector<std::vector<std::vector<size_t >>> end_doc_profiles;
31+ std::vector<std::vector<std::vector<uint16_t >>> start_doc_profiles;
32+ std::vector<std::vector<std::vector<uint16_t >>> end_doc_profiles;
3333
3434 // These vectors are just for convenience of testing
3535 // and visualizing. They are the document array
3636 // profiles but sequentially from top to bottom of BWT
37- std::vector<std::vector<size_t >> start_doc_profiles_seq;
38- std::vector<std::vector<size_t >> end_doc_profiles_seq;
37+ std::vector<std::vector<uint16_t >> start_doc_profiles_seq;
38+ std::vector<std::vector<uint16_t >> end_doc_profiles_seq;
3939
4040 typedef size_t size_type;
4141
4242 doc_queries (std::string filename, std::string output_path=" " , size_t num_profiles=0 , bool rle = true ):
4343 ri::r_index<sparse_bv_type, rle_string_t>(),
44- start_doc_profiles(256 , std::vector<std::vector<size_t >>(0 , std::vector<size_t >(0 ))),
45- end_doc_profiles(256 , std::vector<std::vector<size_t >>(0 , std::vector<size_t >(0 )))
44+ start_doc_profiles(256 , std::vector<std::vector<uint16_t >>(0 , std::vector<uint16_t >(0 ))),
45+ end_doc_profiles(256 , std::vector<std::vector<uint16_t >>(0 , std::vector<uint16_t >(0 )))
4646 {
4747 std::string bwt_fname = filename + " .bwt" ;
4848
@@ -108,16 +108,16 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
108108 STATUS_LOG (" build_profiles" , " loading the document array profiles" );
109109 start = std::chrono::system_clock::now ();
110110
111- start_doc_profiles_seq.resize (this ->r , std::vector<size_t >(num_docs, 0 ));
112- end_doc_profiles_seq.resize (this ->r , std::vector<size_t >(num_docs, 0 ));
113-
114- read_doc_profiles (start_doc_profiles, filename + " .sdap" , this ->num_docs , this ->r , start_doc_profiles_seq);
115- read_doc_profiles (end_doc_profiles, filename + " .edap" , this ->num_docs , this ->r , end_doc_profiles_seq);
111+ // If we are trying to print out profiles, then we will actually use these vectors
112+ if (output_path.size ()) {
113+ start_doc_profiles_seq.resize (this ->r , std::vector<uint16_t >(num_docs, 0 ));
114+ end_doc_profiles_seq.resize (this ->r , std::vector<uint16_t >(num_docs, 0 ));
115+ }
116+ read_doc_profiles (start_doc_profiles, filename + " .sdap" , this ->num_docs , this ->r , start_doc_profiles_seq, output_path.size ());
117+ read_doc_profiles (end_doc_profiles, filename + " .edap" , this ->num_docs , this ->r , end_doc_profiles_seq, output_path.size ());
116118
117119 DONE_LOG ((std::chrono::system_clock::now () - start));
118120
119- // end_doc_profiles_two.resize(this->r, std::vector<size_t>(num_docs, 0));
120-
121121 // If the user wants to print out document array profiles ...
122122 if (output_path.size ()) {
123123 FORCE_LOG (" build_profiles" , " number of documents: d = %ld" , this ->num_docs );
@@ -140,7 +140,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
140140 }
141141 }
142142
143- static void read_doc_profiles (std::vector<std::vector<std::vector<size_t >>>& prof_matrix, std::string input_file, size_t num_docs, size_t num_runs, std::vector<std::vector<size_t >>& doc_profiles_seq) {
143+ static void read_doc_profiles (std::vector<std::vector<std::vector<uint16_t >>>& prof_matrix, std::string input_file, size_t num_docs,
144+ size_t num_runs, std::vector<std::vector<uint16_t >>& doc_profiles_seq, bool load_sequentially) {
144145 /* loads a set of document array profiles into their respective matrix */
145146
146147 // First, let's open the file and verify the size/# of docs are valid
@@ -169,12 +170,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
169170 if (fread (&curr_bwt_ch, 1 , 1 , fd) != 1 )
170171 FATAL_ERROR (" issue occurred while reading in bwt character from doc profiles file." );
171172
172- std::vector<size_t > curr_profile (num_docs, 0 );
173+ std::vector<uint16_t > curr_profile (num_docs, 0 );
173174 for (size_t j = 0 ; j < num_docs; j++) {
174175 if ((fread (&curr_val, DOCWIDTH, 1 , fd)) != 1 )
175176 error (" fread() file " + input_file + " failed" );
176177 curr_profile[j] = curr_val;
177- doc_profiles_seq[i][j] = curr_val;
178+ if (load_sequentially)
179+ doc_profiles_seq[i][j] = curr_val;
178180 }
179181 prof_matrix[curr_bwt_ch].push_back (curr_profile);
180182 }
@@ -194,21 +196,30 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
194196 seq = kseq_init (fp);
195197
196198 // lambda to print out the document listing
197- auto process_profile = [&](std::vector<size_t > profile, size_t length) {
198- std::vector<size_t > docs_found;
199+ auto process_profile = [&](std::vector<uint16_t > profile, uint16_t length) {
199200 std::string output_str = " {" ;
200201 bool found_one = false ;
201202
202203 for (size_t i = 0 ; i < profile.size (); i++) {
204+ // std::cout << "length = " << length << " profile_val = " << profile[i] << std::endl;
203205 if (profile[i] >= length) {
204206 output_str += " ," + std::to_string (i);
205207 found_one = true ;
206208 }
207209 }
208210 if (found_one)
209211 output_str = " {" + output_str.substr (2 ) + " } " ;
210- else
212+ else {
211213 output_str = " {} " ;
214+ // std::cout << "length = " << length << std::endl;
215+ // for (size_t i = 0; i < profile.size();i++)
216+ // std::cout << profile[i] << " ";
217+ // std::cout << "\n";
218+ // std::exit(1);
219+ }
220+ // for (size_t i = 0; i < profile.size();i++)
221+ // std::cout << profile[i] << " ";
222+ // std::cout << "\n";
212223 listings_fd << output_str;
213224 };
214225
@@ -221,7 +232,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
221232 }
222233
223234 size_t start = 0 , end = this ->bwt .size (), end_pos_of_match = seq->seq .l -1 ;
224- std::vector<size_t > curr_profile (this ->num_docs , 0 );
235+ std::vector<uint16_t > curr_profile (this ->num_docs , 0 );
236+ uint16_t length = 0 ;
225237
226238 listings_fd << " >" << seq->name .s << " \n " ;
227239
@@ -232,11 +244,15 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
232244 // Tell us what type of profile to grab based on pointer variables
233245 bool use_start = false , use_end = false ;
234246
247+ // std::cout << "\n";
248+
235249 // Perform backward search and report document listings when
236250 // range goes empty or we reach the end
237251 for (int i = (seq->seq .l -1 ); i >= 0 ; i--) {
238252 uint8_t next_ch = seq->seq .s [i];
239253
254+ // std::cout << "start = " << start << " end = " << end << std::endl;
255+
240256 size_t num_ch_before_start = this ->bwt .rank (start, next_ch);
241257 size_t num_ch_before_end = this ->bwt .rank (end, next_ch);
242258 size_t start_run = this ->bwt .run_of_position (start);
@@ -253,10 +269,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
253269 curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
254270 else
255271 curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
256- std::for_each (curr_profile.begin (), curr_profile.end (), [&](size_t &x){x+=num_LF_steps;});
272+ std::for_each (curr_profile.begin (), curr_profile.end (), [&](uint16_t &x){x+=num_LF_steps;});
257273
258274 listings_fd << " [" << (i+1 ) << " ," << end_pos_of_match << " ] " ;
259- process_profile (curr_profile, (end_pos_of_match-i));
275+ // std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
276+
277+ length = std::min ((size_t ) MAXLCPVALUE, (end_pos_of_match-i));
278+ process_profile (curr_profile, length);
260279 end_pos_of_match = i;
261280
262281 start = 0 ; end = this ->bwt .size ();
@@ -271,6 +290,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
271290 num_LF_steps = 0 ;
272291 use_start = false ; use_end = false ;
273292
293+ // std::cout << "case 1: next_ch = " << next_ch << std::endl;
294+
274295 // If the start position run is the same as query
275296 // ch, then we can guarantee that end of run is in the range
276297 // otherwise, we can guarantee the start of run is in range.
@@ -287,10 +308,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
287308 curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
288309 else
289310 curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
290- std::for_each (curr_profile.begin (), curr_profile.end (), [&](size_t &x){x+=num_LF_steps;});
311+ std::for_each (curr_profile.begin (), curr_profile.end (), [&](uint16_t &x){x+=num_LF_steps;});
291312
292313 listings_fd << " [" << (i+1 ) << " ," << end_pos_of_match << " ] " ;
293- process_profile (curr_profile, (end_pos_of_match-i));
314+ // std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
315+
316+ length = std::min ((size_t ) MAXLCPVALUE, (end_pos_of_match-i));
317+ process_profile (curr_profile, length);
294318 end_pos_of_match = i;
295319
296320 start = 0 ; end = this ->bwt .size ();
@@ -304,6 +328,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
304328 num_LF_steps = 0 ;
305329 use_start = false ; use_end = false ;
306330
331+ // std::cout << "case 2" << std::endl;
332+
307333 // If the start position run is the same as query
308334 // ch, then we can guarantee that end of run is in the range
309335 // otherwise, we can guarantee the start of run is in range.
@@ -316,6 +342,7 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
316342 else
317343 {
318344 num_LF_steps++;
345+ // std::cout << "case 3" << std::endl;
319346 // std::transform(curr_profile.begin(), curr_profile.end(), curr_profile.begin(),
320347 // [](size_t x) { return (++x); });
321348 }
@@ -329,10 +356,14 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
329356 curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
330357 else
331358 curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
332- std::for_each (curr_profile.begin (), curr_profile.end (), [&](size_t &x){x+=num_LF_steps;});
359+ std::for_each (curr_profile.begin (), curr_profile.end (), [&](uint16_t &x){x+=num_LF_steps;});
333360
334361 listings_fd << " [" << 0 << " ," << end_pos_of_match << " ] " ;
335- process_profile (curr_profile, end_pos_of_match+1 );
362+ // std::cout << "[" << 0 << "," << end_pos_of_match << "] " << std::endl;
363+
364+ // std::cout << num_LF_steps << std::endl;
365+ length = std::min ((size_t ) MAXLCPVALUE, end_pos_of_match+1 );
366+ process_profile (curr_profile, length);
336367 listings_fd << " \n " ;
337368 }
338369 }
@@ -402,10 +433,10 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
402433 size_t curr_run_num = 0 ;
403434 std::vector<size_t > ch_pos (256 , 0 );
404435
405- std::vector<size_t > curr_start_profile (num_docs, 0 );
406- std::vector<size_t > curr_end_profile (num_docs, 0 );
436+ std::vector<uint16_t > curr_start_profile (num_docs, 0 );
437+ std::vector<uint16_t > curr_end_profile (num_docs, 0 );
407438
408- size_t max_poss_lcp = (read_length == 0 ) ? (std::pow (2 , DOCWIDTH*8 )-1 ) : read_length;
439+ uint16_t max_poss_lcp = (read_length == 0 ) ? (std::pow (2 , DOCWIDTH*8 )-1 ) : read_length;
409440
410441 for (size_t i = 0 ; i < n; i++){
411442 // Determine if we have started a new run
0 commit comments