|
50 | 50 | #include "logger.h"
|
51 | 51 |
|
52 | 52 | #define BITS_PER_CHAR 8
|
| 53 | +#define BITS_PER_BASE 2 // for DNA/RNA sequences |
| 54 | +#define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE) |
| 55 | +#define HISTOS_PER_BASE (1<<BITS_PER_BASE) |
53 | 56 |
|
54 | 57 |
|
55 | 58 | namespace cvflann
|
@@ -825,6 +828,73 @@ class KMeansIndex : public NNIndex<Distance>
|
825 | 828 | }
|
826 | 829 |
|
827 | 830 |
|
| 831 | + void computeDnaNodeStatistics(KMeansNodePtr node, int* indices, |
| 832 | + unsigned int indices_length) |
| 833 | + { |
| 834 | + const unsigned int histos_veclen = static_cast<unsigned int>( |
| 835 | + veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR)); |
| 836 | + |
| 837 | + unsigned long long variance = 0ull; |
| 838 | + unsigned int* histograms = new unsigned int[histos_veclen]; |
| 839 | + memset(histograms, 0, sizeof(unsigned int)*histos_veclen); |
| 840 | + |
| 841 | + for (unsigned int i=0; i<indices_length; ++i) { |
| 842 | + variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>( |
| 843 | + distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_))); |
| 844 | + |
| 845 | + unsigned char* vec = (unsigned char*)dataset_[indices[i]]; |
| 846 | + for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { |
| 847 | + histograms[k + ((vec[l]) & 0x03)]++; |
| 848 | + histograms[k + 4 + ((vec[l]>>2) & 0x03)]++; |
| 849 | + histograms[k + 8 + ((vec[l]>>4) & 0x03)]++; |
| 850 | + histograms[k +12 + ((vec[l]>>6) & 0x03)]++; |
| 851 | + } |
| 852 | + } |
| 853 | + |
| 854 | + CentersType* mean = new CentersType[veclen_]; |
| 855 | + memoryCounter_ += int(veclen_*sizeof(CentersType)); |
| 856 | + unsigned char* char_mean = (unsigned char*)mean; |
| 857 | + unsigned int* h = histograms; |
| 858 | + for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { |
| 859 | + char_mean[l] = (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10 |
| 860 | + : h[k] > h[k+3] ? 0x00 : 0x11 |
| 861 | + : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10 |
| 862 | + : h[k+1] > h[k+3] ? 0x01 : 0x11) |
| 863 | + | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000 |
| 864 | + : h[k+4] > h[k+7] ? 0x00 : 0x1100 |
| 865 | + : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000 |
| 866 | + : h[k+5] > h[k+7] ? 0x0100 : 0x1100) |
| 867 | + | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000 |
| 868 | + : h[k+8] >h[k+11] ? 0x00 : 0x110000 |
| 869 | + : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000 |
| 870 | + : h[k+9] >h[k+11] ? 0x010000 : 0x110000) |
| 871 | + | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000 |
| 872 | + : h[k+12] >h[k+15] ? 0x00 : 0x11000000 |
| 873 | + : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000 |
| 874 | + : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000); |
| 875 | + } |
| 876 | + variance = static_cast<unsigned long long>( |
| 877 | + 0.5 + static_cast<double>(variance) / static_cast<double>(indices_length)); |
| 878 | + variance -= static_cast<unsigned long long>( |
| 879 | + ensureSquareDistance<Distance>( |
| 880 | + distance_(mean, ZeroIterator<ElementType>(), veclen_))); |
| 881 | + |
| 882 | + DistanceType radius = 0; |
| 883 | + for (unsigned int i=0; i<indices_length; ++i) { |
| 884 | + DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_); |
| 885 | + if (tmp>radius) { |
| 886 | + radius = tmp; |
| 887 | + } |
| 888 | + } |
| 889 | + |
| 890 | + node->variance = static_cast<DistanceType>(variance); |
| 891 | + node->radius = radius; |
| 892 | + node->pivot = mean; |
| 893 | + |
| 894 | + delete[] histograms; |
| 895 | + } |
| 896 | + |
| 897 | + |
828 | 898 | template<typename DistType>
|
829 | 899 | void computeNodeStatistics(KMeansNodePtr node, int* indices,
|
830 | 900 | unsigned int indices_length,
|
@@ -858,6 +928,22 @@ class KMeansIndex : public NNIndex<Distance>
|
858 | 928 | computeBitfieldNodeStatistics(node, indices, indices_length);
|
859 | 929 | }
|
860 | 930 |
|
| 931 | + void computeNodeStatistics(KMeansNodePtr node, int* indices, |
| 932 | + unsigned int indices_length, |
| 933 | + const cvflann::DNAmmingLUT* identifier) |
| 934 | + { |
| 935 | + (void)identifier; |
| 936 | + computeDnaNodeStatistics(node, indices, indices_length); |
| 937 | + } |
| 938 | + |
| 939 | + void computeNodeStatistics(KMeansNodePtr node, int* indices, |
| 940 | + unsigned int indices_length, |
| 941 | + const cvflann::DNAmming2<unsigned char>* identifier) |
| 942 | + { |
| 943 | + (void)identifier; |
| 944 | + computeDnaNodeStatistics(node, indices, indices_length); |
| 945 | + } |
| 946 | + |
861 | 947 |
|
862 | 948 | void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
|
863 | 949 | std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
|
@@ -1051,6 +1137,112 @@ class KMeansIndex : public NNIndex<Distance>
|
1051 | 1137 | }
|
1052 | 1138 |
|
1053 | 1139 |
|
| 1140 | + void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers, |
| 1141 | + std::vector<DistanceType>& radiuses, int* belongs_to, int* count) |
| 1142 | + { |
| 1143 | + for (int i=0; i<branching; ++i) { |
| 1144 | + centers[i] = new CentersType[veclen_]; |
| 1145 | + memoryCounter_ += (int)(veclen_*sizeof(CentersType)); |
| 1146 | + } |
| 1147 | + |
| 1148 | + const unsigned int histos_veclen = static_cast<unsigned int>( |
| 1149 | + veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR)); |
| 1150 | + cv::AutoBuffer<unsigned int> histos_buf(branching*histos_veclen); |
| 1151 | + Matrix<unsigned int> histos(histos_buf.data(), branching, histos_veclen); |
| 1152 | + |
| 1153 | + bool converged = false; |
| 1154 | + int iteration = 0; |
| 1155 | + while (!converged && iteration<iterations_) { |
| 1156 | + converged = true; |
| 1157 | + iteration++; |
| 1158 | + |
| 1159 | + // compute the new cluster centers |
| 1160 | + for (int i=0; i<branching; ++i) { |
| 1161 | + memset(histos[i],0,sizeof(unsigned int)*histos_veclen); |
| 1162 | + radiuses[i] = 0; |
| 1163 | + } |
| 1164 | + for (int i=0; i<indices_length; ++i) { |
| 1165 | + unsigned char* vec = (unsigned char*)dataset_[indices[i]]; |
| 1166 | + unsigned int* h = histos[belongs_to[i]]; |
| 1167 | + for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { |
| 1168 | + h[k + ((vec[l]) & 0x03)]++; |
| 1169 | + h[k + 4 + ((vec[l]>>2) & 0x03)]++; |
| 1170 | + h[k + 8 + ((vec[l]>>4) & 0x03)]++; |
| 1171 | + h[k +12 + ((vec[l]>>6) & 0x03)]++; |
| 1172 | + } |
| 1173 | + } |
| 1174 | + for (int i=0; i<branching; ++i) { |
| 1175 | + unsigned int* h = histos[i]; |
| 1176 | + unsigned char* charCenter = (unsigned char*)centers[i]; |
| 1177 | + for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { |
| 1178 | + charCenter[l]= (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10 |
| 1179 | + : h[k] > h[k+3] ? 0x00 : 0x11 |
| 1180 | + : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10 |
| 1181 | + : h[k+1] > h[k+3] ? 0x01 : 0x11) |
| 1182 | + | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000 |
| 1183 | + : h[k+4] > h[k+7] ? 0x00 : 0x1100 |
| 1184 | + : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000 |
| 1185 | + : h[k+5] > h[k+7] ? 0x0100 : 0x1100) |
| 1186 | + | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000 |
| 1187 | + : h[k+8] >h[k+11] ? 0x00 : 0x110000 |
| 1188 | + : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000 |
| 1189 | + : h[k+9] >h[k+11] ? 0x010000 : 0x110000) |
| 1190 | + | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000 |
| 1191 | + : h[k+12] >h[k+15] ? 0x00 : 0x11000000 |
| 1192 | + : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000 |
| 1193 | + : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000); |
| 1194 | + } |
| 1195 | + } |
| 1196 | + |
| 1197 | + std::vector<int> new_centroids(indices_length); |
| 1198 | + std::vector<DistanceType> dists(indices_length); |
| 1199 | + |
| 1200 | + // reassign points to clusters |
| 1201 | + KMeansDistanceComputer<ElementType**> invoker( |
| 1202 | + distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists); |
| 1203 | + parallel_for_(cv::Range(0, (int)indices_length), invoker); |
| 1204 | + |
| 1205 | + for (int i=0; i < indices_length; ++i) { |
| 1206 | + DistanceType dist(dists[i]); |
| 1207 | + int new_centroid(new_centroids[i]); |
| 1208 | + if (dist > radiuses[new_centroid]) { |
| 1209 | + radiuses[new_centroid] = dist; |
| 1210 | + } |
| 1211 | + if (new_centroid != belongs_to[i]) { |
| 1212 | + count[belongs_to[i]]--; |
| 1213 | + count[new_centroid]++; |
| 1214 | + belongs_to[i] = new_centroid; |
| 1215 | + converged = false; |
| 1216 | + } |
| 1217 | + } |
| 1218 | + |
| 1219 | + for (int i=0; i<branching; ++i) { |
| 1220 | + // if one cluster converges to an empty cluster, |
| 1221 | + // move an element into that cluster |
| 1222 | + if (count[i]==0) { |
| 1223 | + int j = (i+1)%branching; |
| 1224 | + while (count[j]<=1) { |
| 1225 | + j = (j+1)%branching; |
| 1226 | + } |
| 1227 | + |
| 1228 | + for (int k=0; k<indices_length; ++k) { |
| 1229 | + if (belongs_to[k]==j) { |
| 1230 | + // for cluster j, we move the furthest element from the center to the empty cluster i |
| 1231 | + if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) { |
| 1232 | + belongs_to[k] = i; |
| 1233 | + count[j]--; |
| 1234 | + count[i]++; |
| 1235 | + break; |
| 1236 | + } |
| 1237 | + } |
| 1238 | + } |
| 1239 | + converged = false; |
| 1240 | + } |
| 1241 | + } |
| 1242 | + } |
| 1243 | + } |
| 1244 | + |
| 1245 | + |
1054 | 1246 | void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
|
1055 | 1247 | int branching, int level, CentersType** centers,
|
1056 | 1248 | std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
|
@@ -1150,7 +1342,7 @@ class KMeansIndex : public NNIndex<Distance>
|
1150 | 1342 | /**
|
1151 | 1343 | * The methods responsible with doing the recursive hierarchical clustering on
|
1152 | 1344 | * binary vectors.
|
1153 |
| - * As some might have heared that KMeans on binary data doesn't make sense, |
| 1345 | + * As some might have heard that KMeans on binary data doesn't make sense, |
1154 | 1346 | * it's worth a little explanation why it actually fairly works. As
|
1155 | 1347 | * with the Hierarchical Clustering algortihm, we seed several centers for the
|
1156 | 1348 | * current node by picking some of its points. Then in a first pass each point
|
@@ -1233,6 +1425,34 @@ class KMeansIndex : public NNIndex<Distance>
|
1233 | 1425 | }
|
1234 | 1426 |
|
1235 | 1427 |
|
| 1428 | + void refineAndSplitClustering( |
| 1429 | + KMeansNodePtr node, int* indices, int indices_length, int branching, |
| 1430 | + int level, CentersType** centers, std::vector<DistanceType>& radiuses, |
| 1431 | + int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier) |
| 1432 | + { |
| 1433 | + (void)identifier; |
| 1434 | + refineDnaClustering( |
| 1435 | + indices, indices_length, branching, centers, radiuses, belongs_to, count); |
| 1436 | + |
| 1437 | + computeAnyBitfieldSubClustering(node, indices, indices_length, branching, |
| 1438 | + level, centers, radiuses, belongs_to, count); |
| 1439 | + } |
| 1440 | + |
| 1441 | + |
| 1442 | + void refineAndSplitClustering( |
| 1443 | + KMeansNodePtr node, int* indices, int indices_length, int branching, |
| 1444 | + int level, CentersType** centers, std::vector<DistanceType>& radiuses, |
| 1445 | + int* belongs_to, int* count, const cvflann::DNAmming2<unsigned char>* identifier) |
| 1446 | + { |
| 1447 | + (void)identifier; |
| 1448 | + refineDnaClustering( |
| 1449 | + indices, indices_length, branching, centers, radiuses, belongs_to, count); |
| 1450 | + |
| 1451 | + computeAnyBitfieldSubClustering(node, indices, indices_length, branching, |
| 1452 | + level, centers, radiuses, belongs_to, count); |
| 1453 | + } |
| 1454 | + |
| 1455 | + |
1236 | 1456 | /**
|
1237 | 1457 | * The method responsible with actually doing the recursive hierarchical
|
1238 | 1458 | * clustering
|
|
0 commit comments