Skip to content

Commit fc267d7

Browse files
committed
made various changes to code, check out v1.0.2 in log
1 parent 826edea commit fc267d7

File tree

5 files changed

+108
-56
lines changed

5 files changed

+108
-56
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
# PFP-Doc Change Log
22

3-
## v1.0.1 - latest
3+
## v1.0.2 - latest
4+
- Updated the construction code to write out LCP values in 16-bit registers as opposed to using 8-bit registers. This
5+
now allows values to be up to 2^16-1, so any values larger than that are capped at 2^16-1.
6+
- Updated the query subcommand to load the profiles using the 16-bit registers.
7+
- Updated the SIMD code used for the predecessor table to use 16-bit integers; I had to use
8+
masked versions of various functions since they were not present in the header.
9+
10+
## v1.0.1
411
- Added info subcommand for printing out a section of the document array profiles
512
- Updated gsacak repo to avoid error with compilation
613

include/doc_queries.hpp

Lines changed: 60 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,21 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
2828
// These vectors have the following dimensions: [256][num of ith char][num_docs]
2929
// This structure stores the DA profiles for each
3030
// character separately.
31-
std::vector<std::vector<std::vector<size_t>>> start_doc_profiles;
32-
std::vector<std::vector<std::vector<size_t>>> end_doc_profiles;
31+
std::vector<std::vector<std::vector<uint16_t>>> start_doc_profiles;
32+
std::vector<std::vector<std::vector<uint16_t>>> end_doc_profiles;
3333

3434
// These vectors are just for convenience of testing
3535
// and visualizing. They are the document array
3636
// profiles but sequentially from top to bottom of BWT
37-
std::vector<std::vector<size_t>> start_doc_profiles_seq;
38-
std::vector<std::vector<size_t>> end_doc_profiles_seq;
37+
std::vector<std::vector<uint16_t>> start_doc_profiles_seq;
38+
std::vector<std::vector<uint16_t>> end_doc_profiles_seq;
3939

4040
typedef size_t size_type;
4141

4242
doc_queries(std::string filename, std::string output_path="", size_t num_profiles=0, bool rle = true):
4343
ri::r_index<sparse_bv_type, rle_string_t>(),
44-
start_doc_profiles(256, std::vector<std::vector<size_t>>(0, std::vector<size_t>(0))),
45-
end_doc_profiles(256, std::vector<std::vector<size_t>>(0, std::vector<size_t>(0)))
44+
start_doc_profiles(256, std::vector<std::vector<uint16_t>>(0, std::vector<uint16_t>(0))),
45+
end_doc_profiles(256, std::vector<std::vector<uint16_t>>(0, std::vector<uint16_t>(0)))
4646
{
4747
std::string bwt_fname = filename + ".bwt";
4848

@@ -108,16 +108,16 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
108108
STATUS_LOG("build_profiles", "loading the document array profiles");
109109
start = std::chrono::system_clock::now();
110110

111-
start_doc_profiles_seq.resize(this->r, std::vector<size_t>(num_docs, 0));
112-
end_doc_profiles_seq.resize(this->r, std::vector<size_t>(num_docs, 0));
113-
114-
read_doc_profiles(start_doc_profiles, filename + ".sdap", this->num_docs, this->r, start_doc_profiles_seq);
115-
read_doc_profiles(end_doc_profiles, filename + ".edap", this->num_docs, this->r, end_doc_profiles_seq);
111+
// If we are trying to print out profiles, then we will actually use these vectors
112+
if (output_path.size()) {
113+
start_doc_profiles_seq.resize(this->r, std::vector<uint16_t>(num_docs, 0));
114+
end_doc_profiles_seq.resize(this->r, std::vector<uint16_t>(num_docs, 0));
115+
}
116+
read_doc_profiles(start_doc_profiles, filename + ".sdap", this->num_docs, this->r, start_doc_profiles_seq, output_path.size());
117+
read_doc_profiles(end_doc_profiles, filename + ".edap", this->num_docs, this->r, end_doc_profiles_seq, output_path.size());
116118

117119
DONE_LOG((std::chrono::system_clock::now() - start));
118120

119-
//end_doc_profiles_two.resize(this->r, std::vector<size_t>(num_docs, 0));
120-
121121
// If the user wants to print out document array profiles ...
122122
if (output_path.size()) {
123123
FORCE_LOG("build_profiles", "number of documents: d = %ld" , this->num_docs);
@@ -140,7 +140,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
140140
}
141141
}
142142

143-
static void read_doc_profiles(std::vector<std::vector<std::vector<size_t>>>& prof_matrix, std::string input_file, size_t num_docs, size_t num_runs, std::vector<std::vector<size_t>>& doc_profiles_seq) {
143+
static void read_doc_profiles(std::vector<std::vector<std::vector<uint16_t>>>& prof_matrix, std::string input_file, size_t num_docs,
144+
size_t num_runs, std::vector<std::vector<uint16_t>>& doc_profiles_seq, bool load_sequentially) {
144145
/* loads a set of document array profiles into their respective matrix */
145146

146147
// First, lets open the file and verify the size/# of docs are valid
@@ -169,12 +170,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
169170
if (fread(&curr_bwt_ch, 1, 1, fd) != 1)
170171
FATAL_ERROR("issue occurred while reading in bwt character from doc profiles file.");
171172

172-
std::vector<size_t> curr_profile (num_docs, 0);
173+
std::vector<uint16_t> curr_profile (num_docs, 0);
173174
for (size_t j = 0; j < num_docs; j++) {
174175
if ((fread(&curr_val, DOCWIDTH, 1, fd)) != 1)
175176
error("fread() file " + input_file + " failed");
176177
curr_profile[j] = curr_val;
177-
doc_profiles_seq[i][j] = curr_val;
178+
if (load_sequentially)
179+
doc_profiles_seq[i][j] = curr_val;
178180
}
179181
prof_matrix[curr_bwt_ch].push_back(curr_profile);
180182
}
@@ -194,21 +196,30 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
194196
seq = kseq_init(fp);
195197

196198
// lambda to print out the document listing
197-
auto process_profile = [&](std::vector<size_t> profile, size_t length) {
198-
std::vector<size_t> docs_found;
199+
auto process_profile = [&](std::vector<uint16_t> profile, uint16_t length) {
199200
std::string output_str = "{";
200201
bool found_one = false;
201202

202203
for (size_t i = 0; i < profile.size(); i++) {
204+
//std::cout << "length = " << length << " profile_val = " << profile[i] << std::endl;
203205
if (profile[i] >= length) {
204206
output_str += "," + std::to_string(i);
205207
found_one = true;
206208
}
207209
}
208210
if (found_one)
209211
output_str = "{" + output_str.substr(2) + "} ";
210-
else
212+
else {
211213
output_str = "{} ";
214+
//std::cout << "length = " << length << std::endl;
215+
// for (size_t i = 0; i < profile.size();i++)
216+
// std::cout << profile[i] << " ";
217+
// std::cout << "\n";
218+
//std::exit(1);
219+
}
220+
// for (size_t i = 0; i < profile.size();i++)
221+
// std::cout << profile[i] << " ";
222+
// std::cout << "\n";
212223
listings_fd << output_str;
213224
};
214225

@@ -221,7 +232,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
221232
}
222233

223234
size_t start = 0, end = this->bwt.size(), end_pos_of_match = seq->seq.l-1;
224-
std::vector<size_t> curr_profile (this->num_docs, 0);
235+
std::vector<uint16_t> curr_profile (this->num_docs, 0);
236+
uint16_t length = 0;
225237

226238
listings_fd << ">" << seq->name.s << "\n";
227239

@@ -232,11 +244,15 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
232244
// Tell us what type of profile to grab based on pointer variables
233245
bool use_start = false, use_end = false;
234246

247+
//std::cout << "\n";
248+
235249
// Perform backward search and report document listings when
236250
// range goes empty or we reach the end
237251
for (int i = (seq->seq.l-1); i >= 0; i--) {
238252
uint8_t next_ch = seq->seq.s[i];
239253

254+
//std::cout << "start = " << start << " end = " << end << std::endl;
255+
240256
size_t num_ch_before_start = this->bwt.rank(start, next_ch);
241257
size_t num_ch_before_end = this->bwt.rank(end, next_ch);
242258
size_t start_run = this->bwt.run_of_position(start);
@@ -253,10 +269,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
253269
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
254270
else
255271
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
256-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](size_t &x){x+=num_LF_steps;});
272+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
257273

258274
listings_fd << "[" << (i+1) << "," << end_pos_of_match << "] ";
259-
process_profile(curr_profile, (end_pos_of_match-i));
275+
//std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
276+
277+
length = std::min((size_t) MAXLCPVALUE, (end_pos_of_match-i));
278+
process_profile(curr_profile, length);
260279
end_pos_of_match = i;
261280

262281
start = 0; end = this->bwt.size();
@@ -271,6 +290,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
271290
num_LF_steps = 0;
272291
use_start = false; use_end = false;
273292

293+
//std::cout << "case 1: next_ch = " << next_ch << std::endl;
294+
274295
// If the start position run is the same as query
275296
// ch, then we can guarantee that end of run is in the range
276297
// otherwise, we can guarantee the start of run is in range.
@@ -287,10 +308,13 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
287308
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
288309
else
289310
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
290-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](size_t &x){x+=num_LF_steps;});
311+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
291312

292313
listings_fd << "[" << (i+1) << "," << end_pos_of_match << "] ";
293-
process_profile(curr_profile, (end_pos_of_match-i));
314+
//std::cout << "[" << (i+1) << "," << end_pos_of_match << "] " << std::endl;
315+
316+
length = std::min((size_t) MAXLCPVALUE, (end_pos_of_match-i));
317+
process_profile(curr_profile, length);
294318
end_pos_of_match = i;
295319

296320
start = 0; end = this->bwt.size();
@@ -304,6 +328,8 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
304328
num_LF_steps = 0;
305329
use_start = false; use_end = false;
306330

331+
//std::cout << "case 2" << std::endl;
332+
307333
// If the start position run is the same as query
308334
// ch, then we can guarantee that end of run is in the range
309335
// otherwise, we can guarantee the start of run is in range.
@@ -316,6 +342,7 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
316342
else
317343
{
318344
num_LF_steps++;
345+
//std::cout << "case 3" << std::endl;
319346
//std::transform(curr_profile.begin(), curr_profile.end(), curr_profile.begin(),
320347
// [](size_t x) { return (++x); });
321348
}
@@ -329,10 +356,14 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
329356
curr_profile = end_doc_profiles[curr_prof_ch][curr_prof_pos];
330357
else
331358
curr_profile = start_doc_profiles[curr_prof_ch][curr_prof_pos];
332-
std::for_each(curr_profile.begin(), curr_profile.end(), [&](size_t &x){x+=num_LF_steps;});
359+
std::for_each(curr_profile.begin(), curr_profile.end(), [&](uint16_t &x){x+=num_LF_steps;});
333360

334361
listings_fd << "[" << 0 << "," << end_pos_of_match << "] ";
335-
process_profile(curr_profile, end_pos_of_match+1);
362+
//std::cout << "[" << 0 << "," << end_pos_of_match << "] " << std::endl;
363+
364+
//std::cout << num_LF_steps << std::endl;
365+
length = std::min((size_t) MAXLCPVALUE, end_pos_of_match+1);
366+
process_profile(curr_profile, length);
336367
listings_fd << "\n";
337368
}
338369
}
@@ -402,10 +433,10 @@ class doc_queries : ri::r_index<sparse_bv_type, rle_string_t>
402433
size_t curr_run_num = 0;
403434
std::vector<size_t> ch_pos (256, 0);
404435

405-
std::vector<size_t> curr_start_profile(num_docs, 0);
406-
std::vector<size_t> curr_end_profile(num_docs, 0);
436+
std::vector<uint16_t> curr_start_profile(num_docs, 0);
437+
std::vector<uint16_t> curr_end_profile(num_docs, 0);
407438

408-
size_t max_poss_lcp = (read_length == 0) ? (std::pow(2, DOCWIDTH*8)-1) : read_length;
439+
uint16_t max_poss_lcp = (read_length == 0) ? (std::pow(2, DOCWIDTH*8)-1) : read_length;
409440

410441
for (size_t i = 0; i < n; i++){
411442
// Determine if we have started a new run

include/pfp_doc.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
/* Useful MACROs */
1818
#define FATAL_ERROR(...) do {std::fprintf(stderr, "\nError: "); std::fprintf(stderr, __VA_ARGS__);\
1919
std::fprintf(stderr, "\n\n"); std::exit(1);} while(0)
20-
#define ASSERT(condition, msg) do {if (!condition){std::fprintf(stderr, "Assertion Failed: %s\n", msg); \
20+
#define ASSERT(condition, msg) do {if (!condition){std::fprintf(stderr, "\nAssertion Failed: %s\n", msg); \
2121
std::exit(1);}} while(0)
2222
#define STATUS_LOG(x, ...) do {std::fprintf(stderr, "[%s] ", x); std::fprintf(stderr, __VA_ARGS__ ); \
2323
std::fprintf(stderr, " ... ");} while(0)
@@ -28,13 +28,16 @@
2828
std::fprintf(stderr, "\n");} while (0)
2929

3030
// Defintions
31-
#define DOCWIDTH 1 // 5
31+
#define PFPDOC_VERSION "1.0.2"
32+
33+
#define DOCWIDTH 2 // 5
3234
#define MAXQUEUELENGTH 1000000
33-
#define PFPDOC_VERSION "1.0.1"
35+
#define MAXLCPVALUE 65535 // 2^16 - 1
3436

3537
#define AVX2_PRESENT __AVX2__
3638
#define AVX512BW_PRESENT __AVX512BW__
3739

40+
3841
/* Function declations */
3942
int pfpdoc_usage();
4043
int build_main(int argc, char** argv);

0 commit comments

Comments
 (0)