Skip to content

Commit 31dc3e9

Browse files
Merge pull request opencv#18211 from pemmanuelviel:pev--handle-dna-vectors
* DNA-mode: update miniflann to handle DNA * DNA-mode: update hierarchical kmeans to handle DNA sequences
1 parent 698b2bf commit 31dc3e9

File tree

2 files changed

+244
-3
lines changed

2 files changed

+244
-3
lines changed

modules/flann/include/opencv2/flann/kmeans_index.h

Lines changed: 221 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@
5050
#include "logger.h"
5151

5252
#define BITS_PER_CHAR 8
53+
#define BITS_PER_BASE 2 // for DNA/RNA sequences
54+
#define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE)
55+
#define HISTOS_PER_BASE (1<<BITS_PER_BASE)
5356

5457

5558
namespace cvflann
@@ -825,6 +828,73 @@ class KMeansIndex : public NNIndex<Distance>
825828
}
826829

827830

831+
void computeDnaNodeStatistics(KMeansNodePtr node, int* indices,
832+
unsigned int indices_length)
833+
{
834+
const unsigned int histos_veclen = static_cast<unsigned int>(
835+
veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
836+
837+
unsigned long long variance = 0ull;
838+
unsigned int* histograms = new unsigned int[histos_veclen];
839+
memset(histograms, 0, sizeof(unsigned int)*histos_veclen);
840+
841+
for (unsigned int i=0; i<indices_length; ++i) {
842+
variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(
843+
distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_)));
844+
845+
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
846+
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
847+
histograms[k + ((vec[l]) & 0x03)]++;
848+
histograms[k + 4 + ((vec[l]>>2) & 0x03)]++;
849+
histograms[k + 8 + ((vec[l]>>4) & 0x03)]++;
850+
histograms[k +12 + ((vec[l]>>6) & 0x03)]++;
851+
}
852+
}
853+
854+
CentersType* mean = new CentersType[veclen_];
855+
memoryCounter_ += int(veclen_*sizeof(CentersType));
856+
unsigned char* char_mean = (unsigned char*)mean;
857+
unsigned int* h = histograms;
858+
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
859+
char_mean[l] = (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
860+
: h[k] > h[k+3] ? 0x00 : 0x11
861+
: h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
862+
: h[k+1] > h[k+3] ? 0x01 : 0x11)
863+
| (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
864+
: h[k+4] > h[k+7] ? 0x00 : 0x1100
865+
: h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
866+
: h[k+5] > h[k+7] ? 0x0100 : 0x1100)
867+
| (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
868+
: h[k+8] >h[k+11] ? 0x00 : 0x110000
869+
: h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
870+
: h[k+9] >h[k+11] ? 0x010000 : 0x110000)
871+
| (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
872+
: h[k+12] >h[k+15] ? 0x00 : 0x11000000
873+
: h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
874+
: h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
875+
}
876+
variance = static_cast<unsigned long long>(
877+
0.5 + static_cast<double>(variance) / static_cast<double>(indices_length));
878+
variance -= static_cast<unsigned long long>(
879+
ensureSquareDistance<Distance>(
880+
distance_(mean, ZeroIterator<ElementType>(), veclen_)));
881+
882+
DistanceType radius = 0;
883+
for (unsigned int i=0; i<indices_length; ++i) {
884+
DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_);
885+
if (tmp>radius) {
886+
radius = tmp;
887+
}
888+
}
889+
890+
node->variance = static_cast<DistanceType>(variance);
891+
node->radius = radius;
892+
node->pivot = mean;
893+
894+
delete[] histograms;
895+
}
896+
897+
828898
template<typename DistType>
829899
void computeNodeStatistics(KMeansNodePtr node, int* indices,
830900
unsigned int indices_length,
@@ -858,6 +928,22 @@ class KMeansIndex : public NNIndex<Distance>
858928
computeBitfieldNodeStatistics(node, indices, indices_length);
859929
}
860930

931+
void computeNodeStatistics(KMeansNodePtr node, int* indices,
932+
unsigned int indices_length,
933+
const cvflann::DNAmmingLUT* identifier)
934+
{
935+
(void)identifier;
936+
computeDnaNodeStatistics(node, indices, indices_length);
937+
}
938+
939+
void computeNodeStatistics(KMeansNodePtr node, int* indices,
940+
unsigned int indices_length,
941+
const cvflann::DNAmming2<unsigned char>* identifier)
942+
{
943+
(void)identifier;
944+
computeDnaNodeStatistics(node, indices, indices_length);
945+
}
946+
861947

862948
void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
863949
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
@@ -1051,6 +1137,112 @@ class KMeansIndex : public NNIndex<Distance>
10511137
}
10521138

10531139

1140+
void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers,
1141+
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
1142+
{
1143+
for (int i=0; i<branching; ++i) {
1144+
centers[i] = new CentersType[veclen_];
1145+
memoryCounter_ += (int)(veclen_*sizeof(CentersType));
1146+
}
1147+
1148+
const unsigned int histos_veclen = static_cast<unsigned int>(
1149+
veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
1150+
cv::AutoBuffer<unsigned int> histos_buf(branching*histos_veclen);
1151+
Matrix<unsigned int> histos(histos_buf.data(), branching, histos_veclen);
1152+
1153+
bool converged = false;
1154+
int iteration = 0;
1155+
while (!converged && iteration<iterations_) {
1156+
converged = true;
1157+
iteration++;
1158+
1159+
// compute the new cluster centers
1160+
for (int i=0; i<branching; ++i) {
1161+
memset(histos[i],0,sizeof(unsigned int)*histos_veclen);
1162+
radiuses[i] = 0;
1163+
}
1164+
for (int i=0; i<indices_length; ++i) {
1165+
unsigned char* vec = (unsigned char*)dataset_[indices[i]];
1166+
unsigned int* h = histos[belongs_to[i]];
1167+
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
1168+
h[k + ((vec[l]) & 0x03)]++;
1169+
h[k + 4 + ((vec[l]>>2) & 0x03)]++;
1170+
h[k + 8 + ((vec[l]>>4) & 0x03)]++;
1171+
h[k +12 + ((vec[l]>>6) & 0x03)]++;
1172+
}
1173+
}
1174+
for (int i=0; i<branching; ++i) {
1175+
unsigned int* h = histos[i];
1176+
unsigned char* charCenter = (unsigned char*)centers[i];
1177+
for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
1178+
charCenter[l]= (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
1179+
: h[k] > h[k+3] ? 0x00 : 0x11
1180+
: h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
1181+
: h[k+1] > h[k+3] ? 0x01 : 0x11)
1182+
| (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
1183+
: h[k+4] > h[k+7] ? 0x00 : 0x1100
1184+
: h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
1185+
: h[k+5] > h[k+7] ? 0x0100 : 0x1100)
1186+
| (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
1187+
: h[k+8] >h[k+11] ? 0x00 : 0x110000
1188+
: h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
1189+
: h[k+9] >h[k+11] ? 0x010000 : 0x110000)
1190+
| (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
1191+
: h[k+12] >h[k+15] ? 0x00 : 0x11000000
1192+
: h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
1193+
: h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
1194+
}
1195+
}
1196+
1197+
std::vector<int> new_centroids(indices_length);
1198+
std::vector<DistanceType> dists(indices_length);
1199+
1200+
// reassign points to clusters
1201+
KMeansDistanceComputer<ElementType**> invoker(
1202+
distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
1203+
parallel_for_(cv::Range(0, (int)indices_length), invoker);
1204+
1205+
for (int i=0; i < indices_length; ++i) {
1206+
DistanceType dist(dists[i]);
1207+
int new_centroid(new_centroids[i]);
1208+
if (dist > radiuses[new_centroid]) {
1209+
radiuses[new_centroid] = dist;
1210+
}
1211+
if (new_centroid != belongs_to[i]) {
1212+
count[belongs_to[i]]--;
1213+
count[new_centroid]++;
1214+
belongs_to[i] = new_centroid;
1215+
converged = false;
1216+
}
1217+
}
1218+
1219+
for (int i=0; i<branching; ++i) {
1220+
// if one cluster converges to an empty cluster,
1221+
// move an element into that cluster
1222+
if (count[i]==0) {
1223+
int j = (i+1)%branching;
1224+
while (count[j]<=1) {
1225+
j = (j+1)%branching;
1226+
}
1227+
1228+
for (int k=0; k<indices_length; ++k) {
1229+
if (belongs_to[k]==j) {
1230+
// for cluster j, we move the furthest element from the center to the empty cluster i
1231+
if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
1232+
belongs_to[k] = i;
1233+
count[j]--;
1234+
count[i]++;
1235+
break;
1236+
}
1237+
}
1238+
}
1239+
converged = false;
1240+
}
1241+
}
1242+
}
1243+
}
1244+
1245+
10541246
void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
10551247
int branching, int level, CentersType** centers,
10561248
std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
@@ -1150,7 +1342,7 @@ class KMeansIndex : public NNIndex<Distance>
11501342
/**
11511343
* The methods responsible with doing the recursive hierarchical clustering on
11521344
* binary vectors.
1153-
* As some might have heared that KMeans on binary data doesn't make sense,
1345+
* As some might have heard that KMeans on binary data doesn't make sense,
11541346
* it's worth a little explanation why it actually fairly works. As
11551347
* with the Hierarchical Clustering algortihm, we seed several centers for the
11561348
* current node by picking some of its points. Then in a first pass each point
@@ -1233,6 +1425,34 @@ class KMeansIndex : public NNIndex<Distance>
12331425
}
12341426

12351427

1428+
void refineAndSplitClustering(
1429+
KMeansNodePtr node, int* indices, int indices_length, int branching,
1430+
int level, CentersType** centers, std::vector<DistanceType>& radiuses,
1431+
int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier)
1432+
{
1433+
(void)identifier;
1434+
refineDnaClustering(
1435+
indices, indices_length, branching, centers, radiuses, belongs_to, count);
1436+
1437+
computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
1438+
level, centers, radiuses, belongs_to, count);
1439+
}
1440+
1441+
1442+
void refineAndSplitClustering(
1443+
KMeansNodePtr node, int* indices, int indices_length, int branching,
1444+
int level, CentersType** centers, std::vector<DistanceType>& radiuses,
1445+
int* belongs_to, int* count, const cvflann::DNAmming2<unsigned char>* identifier)
1446+
{
1447+
(void)identifier;
1448+
refineDnaClustering(
1449+
indices, indices_length, branching, centers, radiuses, belongs_to, count);
1450+
1451+
computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
1452+
level, centers, radiuses, belongs_to, count);
1453+
}
1454+
1455+
12361456
/**
12371457
* The method responsible with actually doing the recursive hierarchical
12381458
* clustering

modules/flann/src/miniflann.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@ typedef ::cvflann::Hamming<uchar> HammingDistance;
345345
#else
346346
typedef ::cvflann::HammingLUT HammingDistance;
347347
#endif
348+
typedef ::cvflann::DNAmming2<uchar> DNAmmingDistance;
348349

349350
Index::Index()
350351
{
@@ -397,6 +398,9 @@ void Index::build(InputArray _data, const IndexParams& params, flann_distance_t
397398
buildIndex< ::cvflann::L1<float> >(index, data, params);
398399
break;
399400
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
401+
case FLANN_DIST_DNAMMING:
402+
buildIndex< DNAmmingDistance >(index, data, params);
403+
break;
400404
case FLANN_DIST_MAX:
401405
buildIndex< ::cvflann::MaxDistance<float> >(index, data, params);
402406
break;
@@ -452,6 +456,9 @@ void Index::release()
452456
deleteIndex< ::cvflann::L1<float> >(index);
453457
break;
454458
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
459+
case FLANN_DIST_DNAMMING:
460+
deleteIndex< DNAmmingDistance >(index);
461+
break;
455462
case FLANN_DIST_MAX:
456463
deleteIndex< ::cvflann::MaxDistance<float> >(index);
457464
break;
@@ -573,7 +580,8 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
573580
CV_INSTRUMENT_REGION();
574581

575582
Mat query = _query.getMat(), indices, dists;
576-
int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
583+
int dtype = (distType == FLANN_DIST_HAMMING)
584+
|| (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
577585

578586
createIndicesDists( _indices, _dists, indices, dists, query.rows, knn, knn, dtype );
579587

@@ -589,6 +597,9 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
589597
runKnnSearch< ::cvflann::L1<float> >(index, query, indices, dists, knn, params);
590598
break;
591599
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
600+
case FLANN_DIST_DNAMMING:
601+
runKnnSearch<DNAmmingDistance>(index, query, indices, dists, knn, params);
602+
break;
592603
case FLANN_DIST_MAX:
593604
runKnnSearch< ::cvflann::MaxDistance<float> >(index, query, indices, dists, knn, params);
594605
break;
@@ -617,7 +628,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
617628
CV_INSTRUMENT_REGION();
618629

619630
Mat query = _query.getMat(), indices, dists;
620-
int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
631+
int dtype = (distType == FLANN_DIST_HAMMING)
632+
|| (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
621633
CV_Assert( maxResults > 0 );
622634
createIndicesDists( _indices, _dists, indices, dists, query.rows, maxResults, INT_MAX, dtype );
623635

@@ -634,6 +646,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
634646
case FLANN_DIST_L1:
635647
return runRadiusSearch< ::cvflann::L1<float> >(index, query, indices, dists, radius, params);
636648
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
649+
case FLANN_DIST_DNAMMING:
650+
return runRadiusSearch< DNAmmingDistance >(index, query, indices, dists, radius, params);
637651
case FLANN_DIST_MAX:
638652
return runRadiusSearch< ::cvflann::MaxDistance<float> >(index, query, indices, dists, radius, params);
639653
case FLANN_DIST_HIST_INTERSECT:
@@ -697,6 +711,9 @@ void Index::save(const String& filename) const
697711
saveIndex< ::cvflann::L1<float> >(this, index, fout);
698712
break;
699713
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
714+
case FLANN_DIST_DNAMMING:
715+
saveIndex< DNAmmingDistance >(this, index, fout);
716+
break;
700717
case FLANN_DIST_MAX:
701718
saveIndex< ::cvflann::MaxDistance<float> >(this, index, fout);
702719
break;
@@ -778,6 +795,7 @@ bool Index::load(InputArray _data, const String& filename)
778795
distType = (flann_distance_t)idistType;
779796

780797
if( !((distType == FLANN_DIST_HAMMING && featureType == CV_8U) ||
798+
(distType == FLANN_DIST_DNAMMING && featureType == CV_8U) ||
781799
(distType != FLANN_DIST_HAMMING && featureType == CV_32F)) )
782800
{
783801
fprintf(stderr, "Reading FLANN index error: unsupported feature type %d for the index type %d\n", featureType, algo);
@@ -797,6 +815,9 @@ bool Index::load(InputArray _data, const String& filename)
797815
loadIndex< ::cvflann::L1<float> >(this, index, data, fin);
798816
break;
799817
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
818+
case FLANN_DIST_DNAMMING:
819+
loadIndex< DNAmmingDistance >(this, index, data, fin);
820+
break;
800821
case FLANN_DIST_MAX:
801822
loadIndex< ::cvflann::MaxDistance<float> >(this, index, data, fin);
802823
break;

0 commit comments

Comments
 (0)