22#include < cstdlib>
33#include < cstring>
44#include < cinttypes>
5- #include < cfloat>
65#include < cmath>
76#include < cassert>
87#include < algorithm>
8+ #include < map>
99#include < memory>
1010
1111#include < cuda_runtime_api.h>
@@ -189,11 +189,11 @@ static KMCUDAResult print_memory_stats(const std::vector<int> &devs) {
189189extern " C" {
190190
191191KMCUDAResult kmeans_init_centroids (
192- KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size ,
193- uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed ,
194- const std::vector<int > &devs, int device_ptrs, int fp16x2, int32_t verbosity ,
195- const float *host_centroids, const udevptrs<float > &samples,
196- udevptrs<float > *dists, udevptrs<float > *dev_sums , udevptrs<float > *centroids) {
192+ KMCUDAInitMethod method, const void *init_params, uint32_t samples_size ,
193+ uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric,
194+ uint32_t seed, const std::vector<int > &devs, int device_ptrs, int fp16x2,
195+ int32_t verbosity, const float *host_centroids, const udevptrs<float > &samples,
196+ udevptrs<float > *dists, udevptrs<float > *aux , udevptrs<float > *centroids) {
197197 srand (seed);
198198 switch (method) {
199199 case kmcudaInitMethodImport:
@@ -237,7 +237,7 @@ KMCUDAResult kmeans_init_centroids(
237237 );
238238 break ;
239239 }
240- case kmcudaInitMethodPlusPlus:
240+ case kmcudaInitMethodPlusPlus: {
241241 INFO (" performing kmeans++...\n " );
242242 float smoke = NAN;
243243 uint32_t first_offset;
@@ -253,9 +253,9 @@ KMCUDAResult kmeans_init_centroids(
253253 printf (" kmeans++: dump %" PRIu32 " %" PRIu32 " %p\n " ,
254254 samples_size, features_size, host_dists.get ());
255255 FOR_EACH_DEVI (
256- printf (" kmeans++: dev #%d: %p %p %p %p \n " , devs[devi],
256+ printf (" kmeans++: dev #%d: %p %p %p\n " , devs[devi],
257257 samples[devi].get (), (*centroids)[devi].get (),
258- (*dists)[devi].get (), (*dev_sums)[devi]. get () );
258+ (*dists)[devi].get ());
259259 );
260260 }
261261 for (uint32_t i = 1 ; i < clusters_size; i++) {
@@ -267,7 +267,7 @@ KMCUDAResult kmeans_init_centroids(
267267 float dist_sum = 0 ;
268268 RETERR (kmeans_cuda_plus_plus (
269269 samples_size, features_size, i, metric, devs, fp16x2, verbosity,
270- samples, centroids, dists, dev_sums, host_dists.get (), &dist_sum),
270+ samples, centroids, dists, host_dists.get (), &dist_sum),
271271 DEBUG (" \n kmeans_cuda_plus_plus failed\n " ));
272272 if (dist_sum != dist_sum) {
273273 assert (dist_sum == dist_sum);
@@ -307,21 +307,79 @@ KMCUDAResult kmeans_init_centroids(
307307 (j - 1 ) * features_size, features_size);
308308 }
309309 break ;
310+ }
// AFK-MC2 centroid initialization: pick the first centroid at random, compute
// the per-sample array q, then for every further centroid draw m candidates
// and keep one of them through the acceptance test in the inner loop below.
311+ case kmcudaInitMethodAFKMC2: {
// init_params is read as a single uint32_t holding m (candidates per step).
// NOTE(review): init_params is dereferenced without a null check - confirm
// that every caller passes a valid pointer for this init method.
312+ uint32_t m = *reinterpret_cast <const uint32_t *>(init_params);
// m == 0 selects the default of 200; any explicit m above samples_size / 2 is
// rejected with kmcudaInvalidArguments.
// NOTE(review): the default 200 is not checked against samples_size / 2, so a
// small dataset takes the default path unvalidated - confirm this is intended.
313+ if (m == 0 ) {
314+ m = 200 ;
315+ } else if (m > samples_size / 2 ) {
316+ INFO (" afkmc2: m > %" PRIu32 " is not supported (got %" PRIu32 " )\n " ,
317+ samples_size / 2 , m);
318+ return kmcudaInvalidArguments;
319+ }
// smoke != smoke holds only while smoke is NaN, so the loop keeps drawing a
// random sample until the first float of that sample, read back from
// samples[0], is not NaN.
320+ float smoke = NAN;
321+ uint32_t first_offset;
322+ while (smoke != smoke) {
// first_offset is an offset in floats: sample index times features_size.
323+ first_offset = (rand () % samples_size) * features_size;
// NOTE(review): the return value of cudaSetDevice is ignored here.
324+ cudaSetDevice (devs[0 ]);
325+ CUCH (cudaMemcpy (&smoke, samples[0 ].get () + first_offset, sizeof (float ),
326+ cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
327+ }
328+ INFO (" afkmc2: calculating q (c0 = %" PRIu32 " )... " ,
329+ first_offset / features_size);
// Presumably copies the chosen sample (features_size floats) into centroid
// slot 0 - the macro is defined outside this view, TODO confirm.
330+ CUMEMCPY_D2D_ASYNC (*centroids, 0 , samples, first_offset, features_size);
// Host buffer with one float per sample, handed to the calc_q helper.
331+ auto q = std::unique_ptr<float []>(new float [samples_size]);
// NOTE(review): unlike the two helper calls inside the loop below, this call
// is not wrapped in RETERR; if it returns a KMCUDAResult, a failure would go
// unnoticed - verify against its declaration.
332+ kmeans_cuda_afkmc2_calc_q (
333+ samples_size, features_size, first_offset / features_size, metric,
334+ devs, fp16x2, verbosity, samples, dists, q.get ());
335+ INFO (" done\n " );
// Host-side scratch arrays with m entries each, reused on every iteration:
// candidate sample indices, random thresholds and candidate weights.
336+ auto cand_ind = std::unique_ptr<uint32_t []>(new uint32_t [m]);
337+ auto rand_a = std::unique_ptr<float []>(new float [m]);
338+ auto p_cand = std::unique_ptr<float []>(new float [m]);
// One iteration per remaining centroid; centroid 0 was set above.
339+ for (uint32_t k = 1 ; k < clusters_size; k++) {
// Progress output: always when verbosity > 1; when verbosity > 0 either on
// every step (fewer than 100 clusters) or on multiples of clusters_size / 100.
// The clusters_size < 100 test short-circuits before the modulo, so the
// divisor clusters_size / 100 is never zero when it is evaluated.
340+ if (verbosity > 1 || (verbosity > 0 && (
341+ clusters_size < 100 || k % (clusters_size / 100 ) == 0 ))) {
// NOTE(review): k is uint32_t but is printed with %d; PRIu32 would match the
// type.
342+ printf (" \r step %d" , k);
343+ fflush (stdout);
344+ }
// Both helpers below receive only the last device's buffers (->back()).
// The aux buffer is passed twice: reinterpreted as uint32_t* from its start,
// and as float* from element m onward. Presumably this layout is why m is
// capped at samples_size / 2 - TODO confirm against the aux allocation.
345+ RETERR (kmeans_cuda_afkmc2_random_step (
346+ k, m, seed, verbosity, dists->back ().get (),
347+ reinterpret_cast <uint32_t *>(aux->back ().get ()),
348+ cand_ind.get (), aux->back ().get () + m, rand_a.get ()));
349+ RETERR (kmeans_cuda_afkmc2_min_dist (
350+ k, m, metric, fp16x2, verbosity, samples.back ().get (),
351+ reinterpret_cast <uint32_t *>(aux->back ().get ()),
352+ centroids->back ().get (), aux->back ().get () + m, p_cand.get ()));
// Walk the m candidates in order, tracking the currently accepted one.
// A candidate replaces it when nothing has been accepted with a non-zero
// weight yet (curr_prob == 0), or when the ratio of its weight to the current
// weight exceeds the random threshold rand_a[j].
353+ float curr_prob = 0 ;
354+ uint32_t curr_ind = 0 ;
355+ for (uint32_t j = 0 ; j < m; j++) {
// Candidate weight: p_cand[j] divided by the q entry of the candidate sample.
356+ auto cand_prob = p_cand[j] / q[cand_ind[j]];
357+ if (curr_prob == 0 || cand_prob / curr_prob > rand_a[j]) {
358+ curr_ind = j;
359+ curr_prob = cand_prob;
360+ }
361+ }
// Presumably copies the accepted candidate's features into centroid slot k -
// macro semantics are not visible here, TODO confirm.
362+ CUMEMCPY_D2D_ASYNC (*centroids, k * features_size,
363+ samples, cand_ind[curr_ind] * features_size,
364+ features_size);
365+ }
366+ break ;
367+ }
310368 }
311369 INFO (" \r done \n " );
312370 return kmcudaSuccess;
313371}
314372
315373KMCUDAResult kmeans_cuda (
316- KMCUDAInitMethod init, float tolerance, float yinyang_t ,
374+ KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t ,
317375 KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
318376 uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
319377 int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
320378 uint32_t *assignments, float *average_distance) {
321- DEBUG (" arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
322- PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n " , init, tolerance ,
323- yinyang_t , metric, samples_size, features_size, clusters_size, seed ,
324- device, fp16x2, verbosity, samples, centroids, assignments,
379+ DEBUG (" arguments: %d %p % .3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
380+ PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n " , init, init_params ,
381+ tolerance, yinyang_t , metric, samples_size, features_size, clusters_size,
382+ seed, device, fp16x2, verbosity, samples, centroids, assignments,
325383 average_distance);
326384 RETERR (check_kmeans_args (
327385 tolerance, yinyang_t , samples_size, features_size, clusters_size,
@@ -392,8 +450,8 @@ KMCUDAResult kmeans_cuda(
392450 FOR_EACH_DEV (cudaProfilerStart ());
393451 #endif
394452 RETERR (kmeans_init_centroids (
395- init, samples_size, features_size, clusters_size, metric, seed, devs ,
396- device_ptrs, fp16x2, verbosity, centroids, device_samples,
453+ init, init_params, samples_size, features_size, clusters_size, metric,
454+ seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples,
397455 reinterpret_cast <udevptrs<float >*>(&device_assignments),
398456 reinterpret_cast <udevptrs<float >*>(&device_assignments_prev),
399457 &device_centroids),
0 commit comments