Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit 2d185bc

Browse files
Merge branch 'master' into tbb
2 parents 5b24765 + 3b497b8 commit 2d185bc

25 files changed

+2818
-671
lines changed

src/brain/cluster.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// cluster.cpp
6+
//
7+
// Identification: src/brain/cluster.cpp
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "brain/cluster.h"
14+
#include "common/logger.h"
15+
#include "common/macros.h"
16+
17+
namespace peloton {
18+
namespace brain {
19+
20+
void Cluster::AddTemplateAndUpdateCentroid(std::string &fingerprint,
21+
std::vector<double> &feature) {
22+
auto num_templates = templates_.size();
23+
for (unsigned int i = 0u; i < feature.size(); i++) {
24+
centroid_[i] +=
25+
(centroid_[i] * num_templates + feature[i]) * 1.0 / (num_templates + 1);
26+
}
27+
templates_.insert(fingerprint);
28+
}
29+
30+
// Registers a template with this cluster. The centroid is left untouched.
//
// fingerprint: identifier of the template to record in templates_
void Cluster::AddTemplate(const std::string &fingerprint) {
  templates_.insert(fingerprint);
}
33+
34+
// Drops a template from this cluster. The centroid is left untouched.
//
// fingerprint: identifier of the template to erase from templates_
void Cluster::RemoveTemplate(std::string &fingerprint) {
  templates_.erase(fingerprint);
}
37+
38+
void Cluster::UpdateCentroid(
39+
std::map<std::string, std::vector<double>> &features) {
40+
int num_features = centroid_.size();
41+
std::fill(centroid_.begin(), centroid_.end(), 0);
42+
PL_ASSERT(templates_.size() != 0);
43+
44+
for (auto fingerprint : templates_) {
45+
auto feature = features[fingerprint];
46+
for (int i = 0; i < num_features; i++) {
47+
centroid_[i] += feature[i];
48+
}
49+
}
50+
51+
for (int i = 0; i < num_features; i++) {
52+
centroid_[i] /= (templates_.size());
53+
}
54+
}
55+
56+
double Cluster::CosineSimilarity(std::vector<double> &feature) {
57+
double dot = 0.0, denom_a = 0.0, denom_b = 0.0;
58+
double epsilon = 1e-5;
59+
for (unsigned int i = 0u; i < feature.size(); i++) {
60+
dot += centroid_[i] * feature[i];
61+
denom_a += centroid_[i] * centroid_[i];
62+
denom_b += feature[i] * feature[i];
63+
}
64+
65+
if (denom_a < epsilon || denom_b < epsilon) return 0.0;
66+
67+
return dot / (sqrt(denom_a) * sqrt(denom_b));
68+
}
69+
70+
} // namespace brain
71+
} // namespace peloton

src/brain/kd_tree.cpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// kd_tree.cpp
6+
//
7+
// Identification: src/brain/kd_tree.cpp
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "brain/kd_tree.h"
14+
#include "common/logger.h"
15+
16+
namespace peloton {
17+
namespace brain {
18+
19+
// Adds a cluster to the tree: its centroid is registered in the index under
// the next free item id (the current size_), the id is stored back into the
// cluster via SetIndex, and the cluster is appended to clusters_.
//
// cluster: cluster whose centroid should become searchable
void KDTree::Insert(Cluster *cluster) {
  // TODO[Siva]: Currently we unbuild and then build the tree again for every
  // change to the structure. Will need to change AnnoyIndex to optimize this
  index_.unbuild();
  index_.add_item(size_, cluster->GetCentroid().data());
  index_.build(2 * num_features_);

  // Book-keeping on our side, done only after the index has been rebuilt
  cluster->SetIndex(size_);
  clusters_.push_back(cluster);
  ++size_;
}
29+
30+
// TODO[Siva]: cluster is unused as the index is rebuilt using the centroids
31+
// of all the existing clusters already existing in the clusters_
32+
void KDTree::Update(UNUSED_ATTRIBUTE Cluster *cluster) {
33+
// TODO[Siva]: Currently we ubuild and the build tree again for every change
34+
// to the structure. Will need to change AnnoyIndex to optimize this
35+
// The update to the centroid is reflected in the cluster. There is no change
36+
// to the clusters_, so just rebuild the entire index
37+
index_.reinitialize();
38+
Build();
39+
}
40+
41+
void KDTree::GetNN(std::vector<double> &feature, Cluster *&cluster,
42+
double &similarity) {
43+
if (size_ == 0) {
44+
cluster = nullptr;
45+
similarity = 0.0;
46+
return;
47+
}
48+
49+
std::vector<int> closest;
50+
std::vector<double> distances;
51+
index_.get_nns_by_vector(feature.data(), 1, (size_t)-1, &closest, &distances);
52+
cluster = clusters_[closest[0]];
53+
// convert the angular distance to corresponsing cosine similarity
54+
similarity = (2.0 - distances[0]) / 2.0;
55+
}
56+
57+
void KDTree::Build() {
58+
for (int i = 0; i < size_; i++) {
59+
index_.add_item(i, clusters_[i]->GetCentroid().data());
60+
}
61+
// number of random forests built by the AnnoyIndex = 2 * num_features
62+
// the more the faster, but requires more memory
63+
index_.build(2 * num_features_);
64+
}
65+
66+
// Discards the current index contents and rebuilds the tree from the given
// set of clusters.
//
// clusters: clusters that should make up the tree afterwards; they are
//           copied into clusters_ in the set's iteration order
void KDTree::Build(std::set<Cluster *> &clusters) {
  index_.reinitialize();

  // Replace the tracked clusters wholesale with the contents of the set
  clusters_.assign(clusters.begin(), clusters.end());
  size_ = clusters_.size();

  Build();
}
75+
76+
} // namespace brain
77+
} // namespace peloton

src/brain/query_clusterer.cpp

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// query_clusterer.cpp
6+
//
7+
// Identification: src/brain/query_clusterer.cpp
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "brain/query_clusterer.h"

#include <cmath>

#include "common/logger.h"
15+
16+
namespace peloton {
17+
namespace brain {
18+
19+
// Placeholder: currently does nothing. The intended steps are listed below.
void QueryClusterer::UpdateFeatures() {
  // TODO:
  //  1. Read the latest queries from the server over RPC or from the
  //     brain-side catalog
  //  2. Update the feature vectors of the template queries and
  //     L2-normalize them
  //  3. For new templates, insert them into templates_ and call
  //     UpdateTemplate(fingerprint, true)
}
25+
26+
void QueryClusterer::UpdateTemplate(std::string fingerprint, bool is_new) {
27+
// Find the nearest cluster of the template's feature vector by querying the
28+
// KDTree of the centroids of the clusters. If the similarity of the feature
29+
// with the cluster is greater than the threshold, add it to the cluster.
30+
// Otherwise create a new cluster with this template
31+
auto feature = features_[fingerprint];
32+
double similarity = 0.0;
33+
Cluster *cluster = nullptr;
34+
35+
kd_tree_.GetNN(feature, cluster, similarity);
36+
37+
if (cluster == nullptr) {
38+
// If the kd_tree_ is empty
39+
cluster = new Cluster(num_features_);
40+
cluster->AddTemplateAndUpdateCentroid(fingerprint, feature);
41+
kd_tree_.Insert(cluster);
42+
clusters_.insert(cluster);
43+
template_cluster_[fingerprint] = cluster;
44+
return;
45+
}
46+
47+
if (similarity > threshold_) {
48+
// If the nearest neighbor has a similarity higher than the threshold_
49+
if (is_new) {
50+
cluster->AddTemplateAndUpdateCentroid(fingerprint, feature);
51+
kd_tree_.Update(cluster);
52+
} else {
53+
// updating an existing template, so need not update the centroid
54+
cluster->AddTemplate(fingerprint);
55+
}
56+
} else {
57+
// create a new cluster as the nearest neighbor is not similar enough
58+
cluster = new Cluster(num_features_);
59+
cluster->AddTemplateAndUpdateCentroid(fingerprint, feature);
60+
kd_tree_.Insert(cluster);
61+
clusters_.insert(cluster);
62+
}
63+
64+
template_cluster_[fingerprint] = cluster;
65+
}
66+
67+
void QueryClusterer::UpdateExistingTemplates() {
68+
// for each template check the similarity with the cluster
69+
// if the similarity is less than the threshold, then remove it
70+
// and insert into the next nearest cluster
71+
// Update the centroids at the end of the round only
72+
for (auto &feature : features_) {
73+
auto fingerprint = feature.first;
74+
auto *cluster = template_cluster_[fingerprint];
75+
auto similarity = cluster->CosineSimilarity(feature.second);
76+
if (similarity < threshold_) {
77+
cluster->RemoveTemplate(fingerprint);
78+
UpdateTemplate(fingerprint, false);
79+
}
80+
}
81+
82+
std::vector<Cluster *> to_delete;
83+
for (auto &cluster : clusters_) {
84+
if (cluster->GetSize() == 0) {
85+
to_delete.push_back(cluster);
86+
} else {
87+
cluster->UpdateCentroid(features_);
88+
}
89+
}
90+
91+
// Delete the clusters that are empty
92+
for (auto cluster : to_delete) {
93+
clusters_.erase(cluster);
94+
delete cluster;
95+
}
96+
97+
// Rebuild the tree to account for the deleted clusters
98+
kd_tree_.Build(clusters_);
99+
}
100+
101+
void QueryClusterer::MergeClusters() {
102+
// Merge two clusters that are within the threshold in similarity
103+
// Iterate from left to right and merge the left one into right one and mark
104+
// the left one for deletion
105+
std::vector<Cluster *> to_delete;
106+
for (auto i = clusters_.begin(); i != clusters_.end(); i++) {
107+
for (auto j = i; ++j != clusters_.end();) {
108+
auto left = *i;
109+
auto right = *j;
110+
auto r_centroid = right->GetCentroid();
111+
auto similarity = left->CosineSimilarity(r_centroid);
112+
113+
if (similarity > threshold_) {
114+
auto templates = left->GetTemplates();
115+
for (auto &fingerprint : templates) {
116+
right->AddTemplate(fingerprint);
117+
template_cluster_[fingerprint] = right;
118+
}
119+
right->UpdateCentroid(features_);
120+
to_delete.push_back(left);
121+
break;
122+
}
123+
}
124+
}
125+
126+
// Delete the clusters that are empty
127+
for (auto cluster : to_delete) {
128+
clusters_.erase(cluster);
129+
delete cluster;
130+
}
131+
132+
// Rebuild the KDTree to account for changed clusters
133+
kd_tree_.Build(clusters_);
134+
}
135+
136+
// One full clustering round; meant to be scheduled periodically.
void QueryClusterer::UpdateCluster() {
  // 1. Refresh the feature vectors of all templates
  UpdateFeatures();
  // 2. Re-assign the templates that drifted away from their cluster
  UpdateExistingTemplates();
  // 3. Merge the clusters that have become similar to each other
  MergeClusters();
}
144+
145+
void QueryClusterer::AddFeature(std::string &fingerprint,
146+
std::vector<double> feature) {
147+
// Normalize and add a feature into the cluster.
148+
// This is currently used only for testing.
149+
double l2_norm = 0.0;
150+
for (uint i = 0; i < feature.size(); i++) l2_norm += feature[i] * feature[i];
151+
152+
if (l2_norm > 0.0)
153+
for (uint i = 0; i < feature.size(); i++) feature[i] /= l2_norm;
154+
155+
if (features_.find(fingerprint) == features_.end()) {
156+
// Update the cluster if it's a new template
157+
features_[fingerprint] = feature;
158+
UpdateTemplate(fingerprint, true);
159+
} else {
160+
features_[fingerprint] = feature;
161+
}
162+
}
163+
164+
// Deletes every cluster still held in clusters_.
QueryClusterer::~QueryClusterer() {
  for (auto *owned : clusters_) {
    delete owned;
  }
}
167+
168+
} // namespace brain
169+
} // namespace peloton

0 commit comments

Comments
 (0)