Skip to content

Commit dbc3007

Browse files
Merge pull request #4 from lemire/master
Various minor fixes
2 parents 992c083 + a4c05a2 commit dbc3007

File tree

1 file changed

+47
-28
lines changed

1 file changed

+47
-28
lines changed

benchmarks/bulk-insert-and-query.cc

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ using namespace gqfilter;
5757
#endif
5858

5959
// The maximum number of items sampled when determining the lookup performance
60-
const size_t SAMPLE_SIZE = 10 * 1000 * 1000;
60+
const size_t MAX_SAMPLE_SIZE = 10 * 1000 * 1000;
6161

6262
// The statistics gathered for each table type:
6363
struct Statistics {
@@ -509,9 +509,13 @@ Statistics FilterBenchmark(
509509

510510
#ifdef __linux__
511511
unified.end(results);
512-
printf("cycles: %10.zu (%10.3f per key) instructions: %10.zu (%10.3f per key, %10.3f per cycle) cache misses: %10.zu (%10.3f per key) branch misses: %10.zu (%10.3f per key)\n",
513-
(size_t)results[0], results[0]*1.0/add_count, (size_t)results[1], results[1]*1.0/add_count , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/add_count,
514-
(size_t)results[3], results[3] * 1.0/add_count);
512+
printf("adds ");
513+
printf("cycles: %4.1f/key, instructions: (%4.1f/key, %4.1f/cycle) cache misses: %4.2f/key branch misses: %4.2f/key\n",
514+
results[0]*1.0/add_count,
515+
results[1]*1.0/add_count ,
516+
results[1]*1.0/results[0],
517+
results[2]*1.0/add_count,
518+
results[3] * 1.0/add_count);
515519
#else
516520
std::cout << "." << std::flush;
517521
#endif
@@ -539,9 +543,13 @@ Statistics FilterBenchmark(
539543
const auto lookup_time = NowNanos() - start_time;
540544
#ifdef __linux__
541545
unified.end(results);
542-
printf("cycles: %10.zu (%10.3f per key) instructions: %10.zu (%10.3f per key, %10.3f per cycle) cache misses: %10.zu (%10.3f per key) branch misses: %10.zu (%10.3f per key)\n",
543-
(size_t)results[0], results[0]*1.0/to_lookup_mixed.size(), (size_t)results[1], results[1]*1.0/to_lookup_mixed.size() , results[1]*1.0/results[0], (size_t)results[2], results[2]*1.0/to_lookup_mixed.size(),
544-
(size_t)results[3], results[3] * 1.0/to_lookup_mixed.size());
546+
printf("%3.2f%% ",found_probability);
547+
printf("cycles: %4.1f/key, instructions: (%4.1f/key, %4.1f/cycle) cache misses: %4.2f/key branch misses: %4.1f/key\n",
548+
results[0]*1.0/to_lookup_mixed.size(),
549+
results[1]*1.0/to_lookup_mixed.size(),
550+
results[1]*1.0/results[0],
551+
results[2]*1.0/to_lookup_mixed.size(),
552+
results[3] * 1.0/to_lookup_mixed.size());
545553
#else
546554
std::cout << "." << std::flush;
547555
#endif
@@ -652,18 +660,21 @@ int main(int argc, char * argv[]) {
652660
{5,"Cuckoo16"}, {6,"CuckooSemiSort13" }, {7,"Bloom8"},
653661
{8,"Bloom12" }, {9,"Bloom16"}, {10,"BlockedBloom"},
654662
{11,"sort"}, {12,"Xor+8"}, {13,"Xor+16"},
655-
{14,"GCS"}, {15,"CQF"}, {25, "Xor10"}, {37,"Bloom8 (addall)"},
663+
{14,"GCS"}, {15,"CQF"}, {22, "Xor10 (NBitArray)"}, {23, "Xor14 (NBitArray)"},
664+
{25, "Xor10"},{26, "Xor10.666"}, {37,"Bloom8 (addall)"},
656665
{38,"Bloom12 (addall)"},{39,"Bloom16 (addall)"},
657666
{40,"BlockedBloom (addall)"}
658667
};
668+
659669
if (argc < 2) {
660670
cout << "Usage: " << argv[0] << " <numberOfEntries> [<algorithmId> [<seed>]]" << endl;
661671
cout << " numberOfEntries: number of keys, we recommend at least 100000000" << endl;
662-
cout << " algorithmId: -1 for all (default), or 0..n to only run this algorithm" << endl;
672+
cout << " algorithmId: -1 for all default algos, or 0..n to only run this algorithm" << endl;
663673
cout << " algorithmId: can also be a comma-separated list of non-negative integers" << endl;
664674
for(auto i : names) {
665675
cout << " "<< i.first << " : " << i.second << endl;
666676
}
677+
cout << " algorithmId: can also be set to the string 'all' if you want to run them all, including some that are excluded by default" << endl;
667678
cout << " seed: seed for the PRNG; -1 for random seed (default)" << endl;
668679
return 1;
669680
}
@@ -674,14 +685,19 @@ int main(int argc, char * argv[]) {
674685
cerr << "Invalid number: " << argv[1];
675686
return 2;
676687
}
677-
int algorithmId = -1;
688+
int algorithmId = -1; // -1 is just the default
678689
std::set<int> algos;
679690
if (argc > 2) {
680-
if(strstr(argv[2],",") != NULL) {
691+
if(strcmp(argv[2],"all") == 0) {
692+
for(auto i : names) {// we add all the named algos.
693+
algos.insert(i.first);
694+
}
695+
} else if(strstr(argv[2],",") != NULL) {
681696
// we have a list of algos
682697
algorithmId = 9999999; // disabling
683698
parse_comma_separated(argv[2], algos);
684699
} else {
700+
// we select just one
685701
stringstream input_string_2(argv[2]);
686702
input_string_2 >> algorithmId;
687703
if (input_string_2.fail()) {
@@ -699,12 +715,16 @@ int main(int argc, char * argv[]) {
699715
return 2;
700716
}
701717
}
718+
size_t actual_sample_size = MAX_SAMPLE_SIZE;
719+
if (actual_sample_size > add_count) {
720+
actual_sample_size = add_count;
721+
}
702722
vector<uint64_t> to_add = seed == -1 ?
703723
GenerateRandom64Fast(add_count, rand()) :
704724
GenerateRandom64Fast(add_count, seed);
705725
vector<uint64_t> to_lookup = seed == -1 ?
706-
GenerateRandom64Fast(SAMPLE_SIZE, rand()) :
707-
GenerateRandom64Fast(SAMPLE_SIZE, seed + add_count);
726+
GenerateRandom64Fast(actual_sample_size, rand()) :
727+
GenerateRandom64Fast(actual_sample_size, seed + add_count);
708728

709729
if (seed >= 0 && seed < 64) {
710730
// 0-63 are special seeds
@@ -728,7 +748,7 @@ int main(int argc, char * argv[]) {
728748
}
729749
}
730750

731-
assert(to_lookup.size() == SAMPLE_SIZE);
751+
assert(to_lookup.size() == actual_sample_size);
732752
size_t distinct_lookup;
733753
size_t distinct_add;
734754
std::cout << "checking match size... " << std::flush;
@@ -749,14 +769,11 @@ int main(int argc, char * argv[]) {
749769
cout << "WARNING: Filter contains "<< (to_add.size() - distinct_add) << " duplicates." << endl;
750770
hasduplicates = true;
751771
}
752-
size_t actual_sample_size = SAMPLE_SIZE;
753-
if (actual_sample_size > add_count) {
754-
cout << "WARNING: Your set contains only " << add_count << ". We can't very well support a sample size of " << SAMPLE_SIZE << endl;
755-
actual_sample_size = add_count;
756-
}
772+
757773

758774
if (actual_sample_size > to_lookup.size()) {
759-
throw out_of_range("to_lookup must contain at least SAMPLE_SIZE values");
775+
std::cerr << "actual_sample_size = "<< actual_sample_size << std::endl;
776+
throw out_of_range("to_lookup must contain at least actual_sample_size values");
760777
}
761778

762779
std::vector<samples_t> mixed_sets;
@@ -952,42 +969,44 @@ int main(int argc, char * argv[]) {
952969
auto cf = FilterBenchmark<
953970
XorFilter2<uint64_t, uint16_t, NBitArray<uint16_t, 10>, SimpleMixSplit>>(
954971
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
955-
cout << setw(NAME_WIDTH) << "Xor10" << cf << endl;
972+
cout << setw(NAME_WIDTH) << names[22] << cf << endl;
956973
}
957974

975+
958976
if (algorithmId == 23 || (algos.find(23) != algos.end())) {
959977
auto cf = FilterBenchmark<
960978
XorFilter2<uint64_t, uint16_t, NBitArray<uint16_t, 14>, SimpleMixSplit>>(
961979
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
962-
cout << setw(NAME_WIDTH) << "Xor14" << cf << endl;
980+
cout << setw(NAME_WIDTH) << names[23] << cf << endl;
963981
}
964982

965-
if (algorithmId == 24 || (algos.find(24) != algos.end())) {
983+
// this algo overflows and crashes
984+
/*if (algorithmId == 24 || (algos.find(24) != algos.end())) {
966985
auto cf = FilterBenchmark<
967986
XorFilter2<uint64_t, uint32_t, UInt10Array, SimpleMixSplit>>(
968987
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
969-
cout << setw(NAME_WIDTH) << "Xor10.x" << cf << endl;
970-
}
988+
cout << setw(NAME_WIDTH) << names[24] << cf << endl;
989+
}*/
971990

972991
if (algorithmId == 25 || (algos.find(25) != algos.end())) {
973992
auto cf = FilterBenchmark<
974993
XorFilter10<uint64_t, SimpleMixSplit>>(
975994
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
976-
cout << setw(NAME_WIDTH) << "Xor10" << cf << endl;
995+
cout << setw(NAME_WIDTH) << names[25] << cf << endl;
977996
}
978997

979998
if (algorithmId == 26 || (algos.find(26) != algos.end())) {
980999
auto cf = FilterBenchmark<
9811000
XorFilter10_666<uint64_t, SimpleMixSplit>>(
9821001
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
983-
cout << setw(NAME_WIDTH) << "Xor10.666" << cf << endl;
1002+
cout << setw(NAME_WIDTH) << names[26] << cf << endl;
9841003
}
9851004

9861005
if (algorithmId == 27 || (algos.find(27) != algos.end())) {
9871006
auto cf = FilterBenchmark<
9881007
XorFilter13<uint64_t, SimpleMixSplit>>(
9891008
add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
990-
cout << setw(NAME_WIDTH) << "Xor13" << cf << endl;
1009+
cout << setw(NAME_WIDTH) << names[27] << cf << endl;
9911010
}
9921011

9931012
if (algorithmId == 37 || algorithmId < 0 || (algos.find(37) != algos.end())) {

0 commit comments

Comments
 (0)