Skip to content

Commit a009752

Browse files
committed
Added a speed test comparing mean calculation against nearest neighbor search.
Forgot to add the TSVQ header earlier.
1 parent 996cd6b commit a009752

File tree

2 files changed

+249
-15
lines changed

2 files changed

+249
-15
lines changed

src/EMTree.cpp

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,9 @@ void sigEMTreeCluster(vector<SVector<bool>*> &vectors) {
8787
typedef KMeans<vecType, seederType, distanceType, protoType> clustererType;
8888

8989
// EMTree
90-
int depth = 5;
90+
int depth = 3;
9191
int iters = 2;
92-
vector<int> nodeSizes = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
92+
vector<int> nodeSizes = {100};//{10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
9393
for (int m : nodeSizes) {
9494
std::cout << "-------------------" << std::endl;
9595
EMTree<vecType, clustererType, distanceType, protoType> emt(m, depth);
@@ -343,28 +343,63 @@ void reduceDims(const set<int>& topbits, vector<SVector<bool>*>& vectors,
343343
}
344344
}
345345

346-
int main(int argc, char** argv) {
347-
vector<SVector<bool>*> vectors;
348-
int veccount = -1;
349-
loadWikiSignatures(vectors, veccount);
350-
vector<SVector<bool>*> subset;
351-
346+
// Experiment driver: builds a dimension histogram over `vectors`, loads a
// subset of them, reduces that subset to the histogram's top bits, and runs
// k-means on both the full-dimension and the reduced-dimension subset.
// Does nothing when `vectors` is empty.
void testHistogram(vector<SVector<bool>*>& vectors) {
    // vectors[0] is dereferenced below, so bail out early on empty input.
    if (vectors.empty()) {
        return;
    }

    vector<SVector<bool>*> subset;
    int dims = vectors[0]->size();

    set<int> topbits;
    {
        // Scoped so that only the histogram computation is timed.
        boost::timer::auto_cpu_timer histogramTimer("calculating histogram: %w seconds\n");
        topbits = dimensionHistogram(vectors, dims);
    }

    loadSubset(vectors, subset, "/Users/chris/LMW-tree/data/inex_xml_mining_subset_2010.txt");
    cout << "filtered " << subset.size() << " vectors to create a subet" << endl;

    vector<SVector<bool>*> reducedSubset;
    cout << "reducing dimensionality to " << topbits.size() << endl;
    reduceDims(topbits, subset, reducedSubset);

    // Cluster the same subset at full and at reduced dimensionality.
    sigKmeansCluster(subset, "/Users/chris/LMW-tree/data/fulldim_clusters.txt");
    sigKmeansCluster(reducedSubset, "/Users/chris/LMW-tree/data/reduceddim_clusters.txt");
}
364+
365+
// Speed test: times computing a mean prototype over `vectors`, then times
// one Hamming-distance evaluation of that mean against every vector.
// Does nothing when `vectors` is empty.
void testMeanVersusNNSpeed(vector<SVector<bool>*>& vectors) {
    // vectors[0] is dereferenced below, so bail out early on empty input.
    if (vectors.empty()) {
        return;
    }

    const int dims = vectors[0]->size();
    SVector<bool> mean(dims);
    vector<int> weights;  // passed to the prototype functor as-is (left empty here)

    {
        // Timed section 1: prototype (mean) calculation.
        boost::timer::auto_cpu_timer meanTimer("calculating mean: %w seconds\n");
        meanBitPrototype2 proto;
        proto(&mean, vectors, weights);
    }

    {
        // Timed section 2: distance from the mean to every vector.
        boost::timer::auto_cpu_timer hammingTimer("hamming distance: %w seconds\n");
        hammingDistance distance;
        uint64_t total = 0;
        for (SVector<bool>* signature : vectors) {
            total += distance(&mean, signature);
        }
        // Printing the sum keeps the loop's result observable.
        cout << total << endl;
    }
}
386+
387+
int main(int argc, char** argv) {
388+
vector < SVector<bool>*> vectors;
389+
//int veccount = -1;
390+
int veccount = 100000;
391+
{
392+
boost::timer::auto_cpu_timer seed("loading signatures: %w seconds\n");
393+
loadWikiSignatures(vectors, veccount);
394+
}
395+
396+
//sigKTreeCluster(vectors);
397+
//sigTSVQCluster(vectors);
398+
sigEMTreeCluster(vectors);
399+
//testHistogram(vectors);
400+
//testMeanVersusNNSpeed(vectors);
401+
//testReadVectors();
402+
//TestSigEMTree();
403+
return EXIT_SUCCESS;
369404
}
370405

src/TSVQ.h

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
#ifndef TSVQ_H
2+
#define TSVQ_H
3+
4+
#include "StdIncludes.h"
5+
6+
#include "Node.h"
7+
#include "threadpool.hpp"
8+
#include "threadpool/size_policies.hpp"
9+
10+
using namespace boost::threadpool;
11+
12+
template <typename T, typename ClustererType, typename DistanceType, typename ProtoType>
13+
class TSVQ {
14+
private:
15+
// The order of this tree
16+
int _m;
17+
18+
// The order of this tree
19+
int _depth;
20+
21+
// The root of the tree.
22+
Node<T> *_root;
23+
24+
ClustererType _clusterer;
25+
DistanceType _distF;
26+
ProtoType _protoF;
27+
28+
public:
29+
TSVQ(int order, int depth, int maxiters) : _m(order), _depth(depth),
30+
_root(new Node<T>()) {
31+
_clusterer.setNumClusters(_m);
32+
_clusterer.setMaxIters(maxiters);
33+
}
34+
35+
int getClusterCount() {
36+
return clusterCount(_root);
37+
}
38+
39+
int getObjCount() {
40+
return objCount(_root);
41+
}
42+
43+
int getLevelCount() {
44+
return levelCount(_root);
45+
}
46+
47+
int getMaxLevelCount() {
48+
return maxLevelCount(_root);
49+
}
50+
51+
int getMinLevelCount() {
52+
return minLevelCount(_root);
53+
}
54+
55+
void printStats() {
56+
std::cout << "\nNumber of objects: " << getObjCount();
57+
std::cout << "\nCluster count: " << getClusterCount();
58+
std::cout << "\nLevel count (node 0): " << getLevelCount();
59+
std::cout << "\nMax depth: " << getMaxLevelCount();
60+
std::cout << "\nRMSE: " << getRMSE();
61+
}
62+
63+
void cluster(vector<T*> &data) {
64+
// make the root a leaf containing all data
65+
_root->addAll(data);
66+
cluster(_root, _depth);
67+
}
68+
69+
void cluster(Node<T>* current, int depth) {
70+
if (depth == 1) {
71+
return;
72+
} else {
73+
vector<Cluster<T>*> clusters = _clusterer.cluster(current->getKeys());
74+
current->clearKeysAndChildren();
75+
for (Cluster<T>* c : clusters) {
76+
Node<T>* child = new Node<T>();
77+
child->addAll(c->getNearestList());
78+
current->add(c->getCentroid(), child);
79+
}
80+
for (Node<T>* n : current->getChildren()) {
81+
cluster(n, depth - 1);
82+
}
83+
}
84+
}
85+
86+
double getRMSE() {
87+
return RMSE();
88+
}
89+
90+
91+
private:
92+
93+
double RMSE() {
94+
double RMSE = sumSquaredError(NULL, _root);
95+
int size = getObjCount();
96+
RMSE /= size;
97+
RMSE = sqrt(RMSE);
98+
return RMSE;
99+
}
100+
101+
double sumSquaredError(T* parentKey, Node<T> *child) {
102+
103+
double distance = 0.0;
104+
double dis;
105+
106+
if (child->isLeaf()) {
107+
vector<T*> &keys = child->getKeys();
108+
for (T* key : keys) {
109+
dis = _distF(key, parentKey);
110+
distance += dis * dis;
111+
}
112+
} else {
113+
int numEntries = child->size();
114+
115+
vector<T*> &keys = child->getKeys();
116+
vector<Node<T>*> &children = child->getChildren();
117+
118+
for (int i = 0; i < numEntries; i++) {
119+
distance += sumSquaredError(keys[i], children[i]);
120+
}
121+
}
122+
123+
return distance;
124+
}
125+
126+
int objCount(Node<T>* current) {
127+
128+
if (current->isLeaf()) {
129+
return current->size();
130+
} else {
131+
int localCount = 0;
132+
vector<Node<T>*>& children = current->getChildren();
133+
for (Node<T> *child : children) {
134+
localCount += objCount(child);
135+
136+
}
137+
return localCount;
138+
}
139+
}
140+
141+
int clusterCount(Node<T>* current) {
142+
if (current->isLeaf()) {
143+
return 1;
144+
} else {
145+
int localCount = 0;
146+
vector<Node<T>*>& children = current->getChildren();
147+
for (Node<T> *child : children) {
148+
localCount += clusterCount(child);
149+
}
150+
return localCount;
151+
}
152+
}
153+
154+
int levelCount(Node<T>* current) {
155+
if (current->isLeaf()) {
156+
return 1;
157+
} else {
158+
return levelCount(current->getChild(0)) + 1;
159+
}
160+
}
161+
162+
int maxLevelCount(Node<T>* current) {
163+
if (current->isLeaf()) {
164+
return 1;
165+
}
166+
else {
167+
int count = 0;
168+
int maxCount = 0;
169+
vector<Node<T>*>& children = current->getChildren();
170+
for (Node<T> *child : children) {
171+
count = maxLevelCount(child);
172+
if (count > maxCount) maxCount = count;
173+
}
174+
return maxCount + 1;
175+
}
176+
}
177+
178+
int minLevelCount(Node<T>* current) {
179+
if (current->isLeaf()) {
180+
return 1;
181+
}
182+
else {
183+
int count = 0;
184+
int minCount = 0;
185+
vector<Node<T>*>& children = current->getChildren();
186+
for (Node<T> *child : children) {
187+
count = maxLevelCount(child);
188+
if (count < minCount) minCount = count;
189+
}
190+
return minCount + 1;
191+
}
192+
}
193+
194+
195+
};
196+
197+
198+
#endif /* TSVQ_H */
199+

0 commit comments

Comments
 (0)