Skip to content

Commit 8ef9a8f

Browse files
committed
Merge branch 'develop'
2 parents 2617e0e + 72eb2b6 commit 8ef9a8f

File tree

12 files changed

+548
-128
lines changed

12 files changed

+548
-128
lines changed

.travis.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,7 @@ before_script:
1111
- cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0 ..
1212

1313
script:
14-
- make -j2 VERBOSE=1
14+
- make -j2 VERBOSE=1
15+
16+
notifications:
17+
email: false

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ if (NOT DEFINED CUDA_ARCH)
1414
set(CUDA_ARCH "61")
1515
endif()
1616
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
17-
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h
17+
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
1818
metric_abstraction.h kmeans.cu knn.cu)
1919
if (NOT DISABLE_PYTHON)
2020
list(APPEND SOURCE_FILES python.cc)
@@ -29,6 +29,7 @@ if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
2929
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11")
3030
endif()
3131
cuda_add_library(KMCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS})
32+
target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
3233
if(PYTHONLIBS_FOUND)
3334
include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
3435
target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,17 @@ have several days and 12 GB of GPU memory); 300K samples are grouped
2525
into 5000 clusters in 4½ minutes on NVIDIA Titan X (15 iterations); 3M samples
2626
and 1000 clusters take 20 minutes (33 iterations). Yinyang can be
2727
turned off to save GPU memory but the slower Lloyd will be used then.
28-
Three centroid initialization schemes are supported: random, k-means++ and import.
29-
Two distance metrics are supported: L2 (the usual one) and angular (refined cosine).
28+
Four centroid initialization schemes are supported: random, k-means++,
29+
[AFKMC2](http://olivierbachem.ch/files/afkmcmc-oral-pdf.pdf) and import.
30+
Two distance metrics are supported: L2 (the usual one) and angular
31+
(arccos of the scalar product). L1 is in development.
3032
16-bit float support delivers 2x memory compression. If you've got several GPUs,
3133
they can be utilized together and it gives the corresponding linear speedup
3234
either for Lloyd or Yinyang.
3335

3436
The code has been thoroughly tested to yield bit-to-bit identical
35-
results from Yinyang and Lloyd.
37+
results from Yinyang and Lloyd. AFKMC2 was converted from
38+
[the reference code](https://github.com/obachem/kmc2).
3639

3740
Read the articles: [1](http://blog.sourced.tech/post/towards_kmeans_on_gpu/),
3841
[2](https://blog.sourced.tech/post/kmcuda4/).

kmcuda.cc

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
#include <cstdlib>
33
#include <cstring>
44
#include <cinttypes>
5-
#include <cfloat>
65
#include <cmath>
76
#include <cassert>
87
#include <algorithm>
8+
#include <map>
99
#include <memory>
1010

1111
#include <cuda_runtime_api.h>
@@ -189,11 +189,11 @@ static KMCUDAResult print_memory_stats(const std::vector<int> &devs) {
189189
extern "C" {
190190

191191
KMCUDAResult kmeans_init_centroids(
192-
KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size,
193-
uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed,
194-
const std::vector<int> &devs, int device_ptrs, int fp16x2, int32_t verbosity,
195-
const float *host_centroids, const udevptrs<float> &samples,
196-
udevptrs<float> *dists, udevptrs<float> *dev_sums, udevptrs<float> *centroids) {
192+
KMCUDAInitMethod method, const void *init_params, uint32_t samples_size,
193+
uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric,
194+
uint32_t seed, const std::vector<int> &devs, int device_ptrs, int fp16x2,
195+
int32_t verbosity, const float *host_centroids, const udevptrs<float> &samples,
196+
udevptrs<float> *dists, udevptrs<float> *aux, udevptrs<float> *centroids) {
197197
srand(seed);
198198
switch (method) {
199199
case kmcudaInitMethodImport:
@@ -237,7 +237,7 @@ KMCUDAResult kmeans_init_centroids(
237237
);
238238
break;
239239
}
240-
case kmcudaInitMethodPlusPlus:
240+
case kmcudaInitMethodPlusPlus: {
241241
INFO("performing kmeans++...\n");
242242
float smoke = NAN;
243243
uint32_t first_offset;
@@ -253,9 +253,9 @@ KMCUDAResult kmeans_init_centroids(
253253
printf("kmeans++: dump %" PRIu32 " %" PRIu32 " %p\n",
254254
samples_size, features_size, host_dists.get());
255255
FOR_EACH_DEVI(
256-
printf("kmeans++: dev #%d: %p %p %p %p\n", devs[devi],
256+
printf("kmeans++: dev #%d: %p %p %p\n", devs[devi],
257257
samples[devi].get(), (*centroids)[devi].get(),
258-
(*dists)[devi].get(), (*dev_sums)[devi].get());
258+
(*dists)[devi].get());
259259
);
260260
}
261261
for (uint32_t i = 1; i < clusters_size; i++) {
@@ -267,7 +267,7 @@ KMCUDAResult kmeans_init_centroids(
267267
float dist_sum = 0;
268268
RETERR(kmeans_cuda_plus_plus(
269269
samples_size, features_size, i, metric, devs, fp16x2, verbosity,
270-
samples, centroids, dists, dev_sums, host_dists.get(), &dist_sum),
270+
samples, centroids, dists, host_dists.get(), &dist_sum),
271271
DEBUG("\nkmeans_cuda_plus_plus failed\n"));
272272
if (dist_sum != dist_sum) {
273273
assert(dist_sum == dist_sum);
@@ -307,21 +307,79 @@ KMCUDAResult kmeans_init_centroids(
307307
(j - 1) * features_size, features_size);
308308
}
309309
break;
310+
}
311+
case kmcudaInitMethodAFKMC2: {
312+
uint32_t m = *reinterpret_cast<const uint32_t*>(init_params);
313+
if (m == 0) {
314+
m = 200;
315+
} else if (m > samples_size / 2) {
316+
INFO("afkmc2: m > %" PRIu32 " is not supported (got %" PRIu32 ")\n",
317+
samples_size / 2, m);
318+
return kmcudaInvalidArguments;
319+
}
320+
float smoke = NAN;
321+
uint32_t first_offset;
322+
while (smoke != smoke) {
323+
first_offset = (rand() % samples_size) * features_size;
324+
cudaSetDevice(devs[0]);
325+
CUCH(cudaMemcpy(&smoke, samples[0].get() + first_offset, sizeof(float),
326+
cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
327+
}
328+
INFO("afkmc2: calculating q (c0 = %" PRIu32 ")... ",
329+
first_offset / features_size);
330+
CUMEMCPY_D2D_ASYNC(*centroids, 0, samples, first_offset, features_size);
331+
auto q = std::unique_ptr<float[]>(new float[samples_size]);
332+
kmeans_cuda_afkmc2_calc_q(
333+
samples_size, features_size, first_offset / features_size, metric,
334+
devs, fp16x2, verbosity, samples, dists, q.get());
335+
INFO("done\n");
336+
auto cand_ind = std::unique_ptr<uint32_t[]>(new uint32_t[m]);
337+
auto rand_a = std::unique_ptr<float[]>(new float[m]);
338+
auto p_cand = std::unique_ptr<float[]>(new float[m]);
339+
for (uint32_t k = 1; k < clusters_size; k++) {
340+
if (verbosity > 1 || (verbosity > 0 && (
341+
clusters_size < 100 || k % (clusters_size / 100) == 0))) {
342+
printf("\rstep %d", k);
343+
fflush(stdout);
344+
}
345+
RETERR(kmeans_cuda_afkmc2_random_step(
346+
k, m, seed, verbosity, dists->back().get(),
347+
reinterpret_cast<uint32_t*>(aux->back().get()),
348+
cand_ind.get(), aux->back().get() + m, rand_a.get()));
349+
RETERR(kmeans_cuda_afkmc2_min_dist(
350+
k, m, metric, fp16x2, verbosity, samples.back().get(),
351+
reinterpret_cast<uint32_t*>(aux->back().get()),
352+
centroids->back().get(), aux->back().get() + m, p_cand.get()));
353+
float curr_prob = 0;
354+
uint32_t curr_ind = 0;
355+
for (uint32_t j = 0; j < m; j++) {
356+
auto cand_prob = p_cand[j] / q[cand_ind[j]];
357+
if (curr_prob == 0 || cand_prob / curr_prob > rand_a[j]) {
358+
curr_ind = j;
359+
curr_prob = cand_prob;
360+
}
361+
}
362+
CUMEMCPY_D2D_ASYNC(*centroids, k * features_size,
363+
samples, cand_ind[curr_ind] * features_size,
364+
features_size);
365+
}
366+
break;
367+
}
310368
}
311369
INFO("\rdone \n");
312370
return kmcudaSuccess;
313371
}
314372

315373
KMCUDAResult kmeans_cuda(
316-
KMCUDAInitMethod init, float tolerance, float yinyang_t,
374+
KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
317375
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
318376
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
319377
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
320378
uint32_t *assignments, float *average_distance) {
321-
DEBUG("arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
322-
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, tolerance,
323-
yinyang_t, metric, samples_size, features_size, clusters_size, seed,
324-
device, fp16x2, verbosity, samples, centroids, assignments,
379+
DEBUG("arguments: %d %p %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
380+
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, init_params,
381+
tolerance, yinyang_t, metric, samples_size, features_size, clusters_size,
382+
seed, device, fp16x2, verbosity, samples, centroids, assignments,
325383
average_distance);
326384
RETERR(check_kmeans_args(
327385
tolerance, yinyang_t, samples_size, features_size, clusters_size,
@@ -392,8 +450,8 @@ KMCUDAResult kmeans_cuda(
392450
FOR_EACH_DEV(cudaProfilerStart());
393451
#endif
394452
RETERR(kmeans_init_centroids(
395-
init, samples_size, features_size, clusters_size, metric, seed, devs,
396-
device_ptrs, fp16x2, verbosity, centroids, device_samples,
453+
init, init_params, samples_size, features_size, clusters_size, metric,
454+
seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples,
397455
reinterpret_cast<udevptrs<float>*>(&device_assignments),
398456
reinterpret_cast<udevptrs<float>*>(&device_assignments_prev),
399457
&device_centroids),

kmcuda.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ enum KMCUDAResult {
1515
enum KMCUDAInitMethod {
1616
kmcudaInitMethodRandom = 0,
1717
kmcudaInitMethodPlusPlus,
18+
kmcudaInitMethodAFKMC2,
1819
kmcudaInitMethodImport
1920
};
2021

@@ -27,6 +28,10 @@ extern "C" {
2728

2829
/// @brief Performs K-means clustering on GPU / CUDA.
2930
/// @param init centroids initialization method.
31+
/// @param init_params pointer to a struct / number with centroid initialization
32+
/// parameters. Ignored unless init == kmcudaInitMethodAFKMC2.
33+
/// In case with kmcudaInitMethodAFKMC2 it is expected to be
34+
/// uint32_t* to m; m == 0 means the default value (200).
3035
/// @param tolerance if the number of reassignments drop below this ratio, stop.
3136
/// @param yinyang_t the relative number of cluster groups, usually 0.1.
3237
/// @param metric the distance metric to use. The default is Euclidean (L2), can be
@@ -53,7 +58,7 @@ extern "C" {
5358
/// the corresponding centroids. If nullptr, not calculated.
5459
/// @return KMCUDAResult.
5560
KMCUDAResult kmeans_cuda(
56-
KMCUDAInitMethod init, float tolerance, float yinyang_t,
61+
KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
5762
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
5863
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
5964
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,

0 commit comments

Comments
 (0)